Diffstat (limited to 'fs/ext4')
41 files changed, 32088 insertions(+), 11112 deletions(-)
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig new file mode 100644 index 00000000000..efea5d5c44c --- /dev/null +++ b/fs/ext4/Kconfig @@ -0,0 +1,75 @@ +config EXT4_FS + tristate "The Extended 4 (ext4) filesystem" + select JBD2 + select CRC16 + select CRYPTO + select CRYPTO_CRC32C + help + This is the next generation of the ext3 filesystem. + + Unlike the change from ext2 filesystem to ext3 filesystem, + the on-disk format of ext4 is not forwards compatible with + ext3; it is based on extent maps and it supports 48-bit + physical block numbers. The ext4 filesystem also supports delayed + allocation, persistent preallocation, high resolution time stamps, + and a number of other features to improve performance and speed + up fsck time. For more information, please see the web pages at + http://ext4.wiki.kernel.org. + + The ext4 filesystem will support mounting an ext3 + filesystem; while there will be some performance gains from + the delayed allocation and inode table readahead, the best + performance gains will require enabling ext4 features in the + filesystem, or formatting a new filesystem as an ext4 + filesystem initially. + + To compile this file system support as a module, choose M here. The + module will be called ext4. + + If unsure, say N. + +config EXT4_USE_FOR_EXT23 + bool "Use ext4 for ext2/ext3 file systems" + depends on EXT4_FS + depends on EXT3_FS=n || EXT2_FS=n + default y + help + Allow the ext4 file system driver code to be used for ext2 or + ext3 file system mounts. This allows users to reduce their + compiled kernel size by using one file system driver for + ext2, ext3, and ext4 file systems. + +config EXT4_FS_POSIX_ACL + bool "Ext4 POSIX Access Control Lists" + depends on EXT4_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N + +config EXT4_FS_SECURITY + bool "Ext4 Security Labels" + depends on EXT4_FS + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ext4 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config EXT4_DEBUG + bool "EXT4 debugging support" + depends on EXT4_FS + help + Enables run-time debugging support for the ext4 filesystem. + + If you select Y here, then you will be able to turn on debugging + with a command such as: + echo 1 > /sys/module/ext4/parameters/mballoc_debug diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index ac6fa8ca0a2..0310fec2ee3 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -2,12 +2,13 @@ # Makefile for the linux ext4-filesystem routines. 
# -obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o +obj-$(CONFIG_EXT4_FS) += ext4.o -ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o mballoc.o +ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ + ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ + mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ + xattr_trusted.o inline.o -ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o -ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o -ext4dev-$(CONFIG_EXT4DEV_FS_SECURITY) += xattr_security.o +ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o +ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index a8bae8cd1d5..d40c8dbbb0d 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -9,8 +9,8 @@ #include <linux/slab.h> #include <linux/capability.h> #include <linux/fs.h> -#include <linux/ext4_jbd2.h> -#include <linux/ext4_fs.h> +#include "ext4_jbd2.h" +#include "ext4.h" #include "xattr.h" #include "acl.h" @@ -37,37 +37,45 @@ ext4_acl_from_disk(const void *value, size_t size) return ERR_PTR(-EINVAL); if (count == 0) return NULL; - acl = posix_acl_alloc(count, GFP_KERNEL); + acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); - for (n=0; n < count; n++) { + for (n = 0; n < count; n++) { ext4_acl_entry *entry = (ext4_acl_entry *)value; if ((char *)value + sizeof(ext4_acl_entry_short) > end) goto fail; acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); - switch(acl->a_entries[n].e_tag) { - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - value = (char *)value + - sizeof(ext4_acl_entry_short); - acl->a_entries[n].e_id = ACL_UNDEFINED_ID; - break; - case ACL_USER: - case ACL_GROUP: - value = (char *)value + sizeof(ext4_acl_entry); - if ((char *)value > end) - goto fail; - acl->a_entries[n].e_id = - le32_to_cpu(entry->e_id); - break; + switch (acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(ext4_acl_entry_short); + break; - default: + case ACL_USER: + value = (char *)value + sizeof(ext4_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_uid = + make_kuid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + case ACL_GROUP: + value = (char *)value + sizeof(ext4_acl_entry); + if ((char *)value > end) goto fail; + acl->a_entries[n].e_gid = + make_kgid(&init_user_ns, + le32_to_cpu(entry->e_id)); + break; + + default: + goto fail; } } if (value != end) @@ -91,32 +99,37 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) *size = ext4_acl_size(acl->a_count); ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count * - sizeof(ext4_acl_entry), GFP_KERNEL); + sizeof(ext4_acl_entry), GFP_NOFS); if (!ext_acl) return ERR_PTR(-ENOMEM); ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); e = (char *)ext_acl + sizeof(ext4_acl_header); - for (n=0; n < acl->a_count; n++) { + for (n = 0; n < acl->a_count; n++) { + const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext4_acl_entry *entry = (ext4_acl_entry *)e; - entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); - entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); - switch(acl->a_entries[n].e_tag) { - case ACL_USER: - case ACL_GROUP: - entry->e_id = - 
cpu_to_le32(acl->a_entries[n].e_id); - e += sizeof(ext4_acl_entry); - break; + entry->e_tag = cpu_to_le16(acl_e->e_tag); + entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch (acl_e->e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + e += sizeof(ext4_acl_entry); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); + e += sizeof(ext4_acl_entry); + break; - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - e += sizeof(ext4_acl_entry_short); - break; + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(ext4_acl_entry_short); + break; - default: - goto fail; + default: + goto fail; } } return (char *)ext_acl; @@ -126,68 +139,32 @@ fail: return ERR_PTR(-EINVAL); } -static inline struct posix_acl * -ext4_iget_acl(struct inode *inode, struct posix_acl **i_acl) -{ - struct posix_acl *acl = EXT4_ACL_NOT_CACHED; - - spin_lock(&inode->i_lock); - if (*i_acl != EXT4_ACL_NOT_CACHED) - acl = posix_acl_dup(*i_acl); - spin_unlock(&inode->i_lock); - - return acl; -} - -static inline void -ext4_iset_acl(struct inode *inode, struct posix_acl **i_acl, - struct posix_acl *acl) -{ - spin_lock(&inode->i_lock); - if (*i_acl != EXT4_ACL_NOT_CACHED) - posix_acl_release(*i_acl); - *i_acl = posix_acl_dup(acl); - spin_unlock(&inode->i_lock); -} - /* * Inode operation get_posix_acl(). * * inode->i_mutex: don't care */ -static struct posix_acl * +struct posix_acl * ext4_get_acl(struct inode *inode, int type) { - struct ext4_inode_info *ei = EXT4_I(inode); int name_index; char *value = NULL; struct posix_acl *acl; int retval; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return NULL; - - switch(type) { - case ACL_TYPE_ACCESS: - acl = ext4_iget_acl(inode, &ei->i_acl); - if (acl != EXT4_ACL_NOT_CACHED) - return acl; - name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; - break; - - case ACL_TYPE_DEFAULT: - acl = ext4_iget_acl(inode, &ei->i_default_acl); - if (acl != EXT4_ACL_NOT_CACHED) - return acl; - name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; - break; - - default: - return ERR_PTR(-EINVAL); + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + BUG(); } retval = ext4_xattr_get(inode, name_index, "", NULL, 0); if (retval > 0) { - value = kmalloc(retval, GFP_KERNEL); + value = kmalloc(retval, GFP_NOFS); if (!value) return ERR_PTR(-ENOMEM); retval = ext4_xattr_get(inode, name_index, "", value, retval); @@ -200,17 +177,9 @@ ext4_get_acl(struct inode *inode, int type) acl = ERR_PTR(retval); kfree(value); - if (!IS_ERR(acl)) { - switch(type) { - case ACL_TYPE_ACCESS: - ext4_iset_acl(inode, &ei->i_acl, acl); - break; + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); - case ACL_TYPE_DEFAULT: - ext4_iset_acl(inode, &ei->i_default_acl, acl); - break; - } - } return acl; } @@ -220,43 +189,38 @@ ext4_get_acl(struct inode *inode, int type) * inode->i_mutex: down unless called from ext4_new_inode */ static int -ext4_set_acl(handle_t *handle, struct inode *inode, int type, +__ext4_set_acl(handle_t *handle, struct inode *inode, int type, struct posix_acl *acl) { - struct ext4_inode_info *ei = EXT4_I(inode); int name_index; void *value = NULL; size_t size = 0; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - - switch(type) { - case ACL_TYPE_ACCESS: - name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - 
mode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); - if (error < 0) - return error; - else { - inode->i_mode = mode; - ext4_mark_inode_dirty(handle, inode); - if (error == 0) - acl = NULL; - } + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + error = posix_acl_equiv_mode(acl, &inode->i_mode); + if (error < 0) + return error; + else { + inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + if (error == 0) + acl = NULL; } - break; + } + break; - case ACL_TYPE_DEFAULT: - name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; - if (!S_ISDIR(inode->i_mode)) - return acl ? -EACCES : 0; - break; + case ACL_TYPE_DEFAULT: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; - default: - return -EINVAL; + default: + return -EINVAL; } if (acl) { value = ext4_acl_to_disk(acl, &size); @@ -268,40 +232,29 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, value, size, 0); kfree(value); - if (!error) { - switch(type) { - case ACL_TYPE_ACCESS: - ext4_iset_acl(inode, &ei->i_acl, acl); - break; + if (!error) + set_cached_acl(inode, type, acl); - case ACL_TYPE_DEFAULT: - ext4_iset_acl(inode, &ei->i_default_acl, acl); - break; - } - } return error; } -static int -ext4_check_acl(struct inode *inode, int mask) +int +ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); - - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } + handle_t *handle; + int error, retries = 0; - return -EAGAIN; -} +retry: + handle = ext4_journal_start(inode, EXT4_HT_XATTR, + ext4_jbd2_credits_xattr(inode)); + if (IS_ERR(handle)) + return PTR_ERR(handle); -int -ext4_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - return generic_permission(inode, mask, ext4_check_acl); + error = __ext4_set_acl(handle, inode, type, acl); + ext4_journal_stop(handle); + if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + return error; } /* @@ -313,239 +266,23 @@ ext4_permission(struct inode *inode, int mask, struct nameidata *nd) int ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { - struct posix_acl *acl = NULL; - int error = 0; - - if (!S_ISLNK(inode->i_mode)) { - if (test_opt(dir->i_sb, POSIX_ACL)) { - acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(acl)) - return PTR_ERR(acl); - } - if (!acl) - inode->i_mode &= ~current->fs->umask; - } - if (test_opt(inode->i_sb, POSIX_ACL) && acl) { - struct posix_acl *clone; - mode_t mode; - - if (S_ISDIR(inode->i_mode)) { - error = ext4_set_acl(handle, inode, - ACL_TYPE_DEFAULT, acl); - if (error) - goto cleanup; - } - clone = posix_acl_clone(acl, GFP_KERNEL); - error = -ENOMEM; - if (!clone) - goto cleanup; - - mode = inode->i_mode; - error = posix_acl_create_masq(clone, &mode); - if (error >= 0) { - inode->i_mode = mode; - if (error > 0) { - /* This is an extended ACL */ - error = ext4_set_acl(handle, inode, - ACL_TYPE_ACCESS, clone); - } - } - posix_acl_release(clone); - } -cleanup: - posix_acl_release(acl); - return error; -} - -/* - * Does chmod for an inode that may have an Access Control List. The - * inode->i_mode field must be updated to the desired value by the caller - * before calling this function. - * Returns 0 on success, or a negative error number. 
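The rewritten ext4_set_acl() wrapper above retries the whole transaction when it fails with -ENOSPC, on the theory that forcing a journal commit may free up space. A minimal userspace model of that retry loop, where do_set_acl() stands in for the journalled __ext4_set_acl() call and should_retry_alloc() is simplified to a bounded counter (both names are hypothetical):

#include <errno.h>
#include <stdio.h>

#define MAX_RETRIES 3

/* Hypothetical stand-in for __ext4_set_acl(): fails with -ENOSPC twice,
 * then succeeds, to exercise the retry path. */
static int do_set_acl(void)
{
        static int calls;
        return (++calls <= 2) ? -ENOSPC : 0;
}

/* Simplified model of ext4_should_retry_alloc(): in the kernel this
 * forces a journal commit to free up space; here it just bounds retries. */
static int should_retry_alloc(int *retries)
{
        return ++(*retries) <= MAX_RETRIES;
}

int main(void)
{
        int error, retries = 0;
retry:
        error = do_set_acl();   /* start handle + write xattr in the kernel */
        if (error == -ENOSPC && should_retry_alloc(&retries))
                goto retry;
        printf("result: %d after %d retries\n", error, retries);
        return 0;
}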
- * - * We change the ACL rather than storing some ACL entries in the file - * mode permission bits (which would be more efficient), because that - * would break once additional permissions (like ACL_APPEND, ACL_DELETE - * for directories) are added. There are no more bits available in the - * file mode. - * - * inode->i_mutex: down - */ -int -ext4_acl_chmod(struct inode *inode) -{ - struct posix_acl *acl, *clone; + struct posix_acl *default_acl, *acl; int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_KERNEL); - posix_acl_release(acl); - if (!clone) - return -ENOMEM; - error = posix_acl_chmod_masq(clone, inode->i_mode); - if (!error) { - handle_t *handle; - int retries = 0; + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; - retry: - handle = ext4_journal_start(inode, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - ext4_std_error(inode->i_sb, error); - goto out; - } - error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, clone); - ext4_journal_stop(handle); - if (error == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; + if (default_acl) { + error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, + default_acl); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, + acl); + posix_acl_release(acl); } -out: - posix_acl_release(clone); - return error; -} - -/* - * Extended attribute handlers - */ -static size_t -ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len, - const char *name, size_t name_len) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); - - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_ACCESS, size); - return size; -} - -static size_t -ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len, - const char *name, size_t name_len) -{ - const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); - - if (!test_opt(inode->i_sb, POSIX_ACL)) - return 0; - if (list && size <= list_len) - memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); - return size; -} - -static int -ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) -{ - struct posix_acl *acl; - int error; - - if (!test_opt(inode->i_sb, POSIX_ACL)) - return -EOPNOTSUPP; - - acl = ext4_get_acl(inode, type); - if (IS_ERR(acl)) - return PTR_ERR(acl); - if (acl == NULL) - return -ENODATA; - error = posix_acl_to_xattr(acl, buffer, size); - posix_acl_release(acl); - - return error; -} - -static int -ext4_xattr_get_acl_access(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext4_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); -} - -static int -ext4_xattr_get_acl_default(struct inode *inode, const char *name, - void *buffer, size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext4_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); -} - -static int -ext4_xattr_set_acl(struct inode *inode, int type, const void *value, - size_t size) -{ - handle_t *handle; - struct posix_acl *acl; - int error, retries = 0; - - if (!test_opt(inode->i_sb, POSIX_ACL)) - return -EOPNOTSUPP; - if (!is_owner_or_cap(inode)) - return -EPERM; - - if (value) { 
- acl = posix_acl_from_xattr(value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - else if (acl) { - error = posix_acl_valid(acl); - if (error) - goto release_and_out; - } - } else - acl = NULL; - -retry: - handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - error = ext4_set_acl(handle, inode, type, acl); - ext4_journal_stop(handle); - if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - -release_and_out: - posix_acl_release(acl); return error; } - -static int -ext4_xattr_set_acl_access(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext4_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); -} - -static int -ext4_xattr_set_acl_default(struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return ext4_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); -} - -struct xattr_handler ext4_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .list = ext4_xattr_list_acl_access, - .get = ext4_xattr_get_acl_access, - .set = ext4_xattr_set_acl_access, -}; - -struct xattr_handler ext4_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .list = ext4_xattr_list_acl_default, - .get = ext4_xattr_get_acl_default, - .set = ext4_xattr_set_acl_default, -}; diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 26a5c1abf14..da2c79577d7 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -51,31 +51,22 @@ static inline int ext4_acl_count(size_t size) } } -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL - -/* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl - if the ACL has not been cached */ -#define EXT4_ACL_NOT_CACHED ((void *)-1) +#ifdef CONFIG_EXT4_FS_POSIX_ACL /* acl.c */ -extern int ext4_permission (struct inode *, int, struct nameidata *); -extern int ext4_acl_chmod (struct inode *); -extern int ext4_init_acl (handle_t *, struct inode *, struct inode *); +struct posix_acl *ext4_get_acl(struct inode *inode, int type); +int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); -#else /* CONFIG_EXT4DEV_FS_POSIX_ACL */ +#else /* CONFIG_EXT4_FS_POSIX_ACL */ #include <linux/sched.h> -#define ext4_permission NULL - -static inline int -ext4_acl_chmod(struct inode *inode) -{ - return 0; -} +#define ext4_get_acl NULL +#define ext4_set_acl NULL static inline int ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { return 0; } -#endif /* CONFIG_EXT4DEV_FS_POSIX_ACL */ +#endif /* CONFIG_EXT4_FS_POSIX_ACL */ diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 0737e05ba3d..fca382037dd 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -15,18 +15,40 @@ #include <linux/capability.h> #include <linux/fs.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> +#include "ext4.h" +#include "ext4_jbd2.h" +#include "mballoc.h" -#include "group.h" +#include <trace/events/ext4.h> + +static unsigned ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group); /* * balloc.c contains the blocks allocation and deallocation routines */ /* - * Calculate the block group number and offset, given a block number + * Calculate block group number for a given block number + */ +ext4_group_t ext4_get_group_number(struct 
super_block *sb, + ext4_fsblk_t block) +{ + ext4_group_t group; + + if (test_opt2(sb, STD_GROUP_SIZE)) + group = (block - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >> + (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); + else + ext4_get_group_no_and_offset(sb, block, &group, NULL); + return group; +} + +/* + * Calculate the block group number and offset into the block/cluster + * allocation bitmap, given a block number */ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) @@ -35,7 +57,8 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_grpblk_t offset; blocknr = blocknr - le32_to_cpu(es->s_first_data_block); - offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)); + offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >> + EXT4_SB(sb)->s_cluster_bits; if (offsetp) *offsetp = offset; if (blockgrpp) @@ -43,93 +66,195 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, } -/* Initializes an uninitialized block bitmap if given, and returns the - * number of blocks free in the group. */ -unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, - ext4_group_t block_group, struct ext4_group_desc *gdp) +/* + * Check whether the 'block' lives within the 'block_group'. Returns 1 if so + * and 0 otherwise. + */ +static inline int ext4_block_in_group(struct super_block *sb, + ext4_fsblk_t block, + ext4_group_t block_group) +{ + ext4_group_t actual_group; + + actual_group = ext4_get_group_number(sb, block); + return (actual_group == block_group) ? 1 : 0; +} + +/* Return the number of clusters used for file system metadata; this + * represents the overhead needed by the file system. + */ +static unsigned ext4_num_overhead_clusters(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) { - unsigned long start; - int bit, bit_max; - unsigned free_blocks, group_blocks; + unsigned num_clusters; + int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c; + ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); + ext4_fsblk_t itbl_blk; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (bh) { - J_ASSERT_BH(bh, buffer_locked(bh)); - - /* If checksum is bad mark all blocks used to prevent allocation - * essentially implementing a per-group read-only flag. */ - if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, __FUNCTION__, - "Checksum bad for group %lu\n", block_group); - gdp->bg_free_blocks_count = 0; - gdp->bg_free_inodes_count = 0; - gdp->bg_itable_unused = 0; - memset(bh->b_data, 0xff, sb->s_blocksize); - return 0; + /* This is the number of clusters used by the superblock, + * block group descriptors, and reserved block group + * descriptor blocks */ + num_clusters = ext4_num_base_meta_clusters(sb, block_group); + + /* + * For the allocation bitmaps and inode table, we first need + * to check to see if the block is in the block group. If it + * is, then check to see if the cluster is already accounted + * for in the clusters used for the base metadata cluster, or + * if we can increment the base metadata cluster to include + * that block. Otherwise, we will have to track the cluster + * used for the allocation bitmap or inode table explicitly. + * Normally all of these blocks are contiguous, so the special + * case handling shouldn't be necessary except for *very* + * unusual file system layouts. 
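Before ext4_num_overhead_clusters() can compare a bitmap block against the base metadata clusters it must convert the block number to a cluster number, which is what EXT4_B2C() does. A sketch of that conversion and the extend-or-track decision, assuming a hypothetical geometry of 16 blocks per cluster:

#include <stdio.h>
#include <stdint.h>

/* EXT4_B2C(): convert a group-relative block offset to a cluster number
 * by shifting out s_cluster_bits (assumed geometry: 16 blocks/cluster). */
#define CLUSTER_BITS 4
#define B2C(blk) ((blk) >> CLUSTER_BITS)

int main(void)
{
        uint64_t start = 0;          /* first block of the group */
        uint64_t block_bitmap = 37;  /* absolute block of the bitmap (assumed) */
        unsigned num_clusters = 2;   /* clusters used by sb + descriptors */

        unsigned c = B2C(block_bitmap - start);
        if (c < num_clusters)
                printf("bitmap already inside base metadata clusters\n");
        else if (c == num_clusters)
                printf("extend base metadata to cluster %u\n", ++num_clusters - 1);
        else
                printf("track cluster %u separately\n", c);
        return 0;
}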
+ */ + if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { + block_cluster = EXT4_B2C(sbi, + ext4_block_bitmap(sb, gdp) - start); + if (block_cluster < num_clusters) + block_cluster = -1; + else if (block_cluster == num_clusters) { + num_clusters++; + block_cluster = -1; } - memset(bh->b_data, 0, sb->s_blocksize); } - /* Check for superblock and gdt backups in this group */ - bit_max = ext4_bg_has_super(sb, block_group); + if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { + inode_cluster = EXT4_B2C(sbi, + ext4_inode_bitmap(sb, gdp) - start); + if (inode_cluster < num_clusters) + inode_cluster = -1; + else if (inode_cluster == num_clusters) { + num_clusters++; + inode_cluster = -1; + } + } - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || - block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * - sbi->s_desc_per_block) { - if (bit_max) { - bit_max += ext4_bg_num_gdb(sb, block_group); - bit_max += - le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + itbl_blk = ext4_inode_table(sb, gdp); + for (i = 0; i < sbi->s_itb_per_group; i++) { + if (ext4_block_in_group(sb, itbl_blk + i, block_group)) { + c = EXT4_B2C(sbi, itbl_blk + i - start); + if ((c < num_clusters) || (c == inode_cluster) || + (c == block_cluster) || (c == itbl_cluster)) + continue; + if (c == num_clusters) { + num_clusters++; + continue; + } + num_clusters++; + itbl_cluster = c; } - } else { /* For META_BG_BLOCK_GROUPS */ - int group_rel = (block_group - - le32_to_cpu(sbi->s_es->s_first_meta_bg)) % - EXT4_DESC_PER_BLOCK(sb); - if (group_rel == 0 || group_rel == 1 || - (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1)) - bit_max += 1; } - if (block_group == sbi->s_groups_count - 1) { + if (block_cluster != -1) + num_clusters++; + if (inode_cluster != -1) + num_clusters++; + + return num_clusters; +} + +static unsigned int num_clusters_in_group(struct super_block *sb, + ext4_group_t block_group) +{ + unsigned int blocks; + + if (block_group == ext4_get_groups_count(sb) - 1) { /* - * Even though mke2fs always initialize first and last group - * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need - * to make sure we calculate the right free blocks + * Even though mke2fs always initializes the first and + * last group, just in case some other tool was used, + * we need to make sure we calculate the right free + * blocks. */ - group_blocks = ext4_blocks_count(sbi->s_es) - - le32_to_cpu(sbi->s_es->s_first_data_block) - - (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count -1)); - } else { - group_blocks = EXT4_BLOCKS_PER_GROUP(sb); + blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) - + ext4_group_first_block_no(sb, block_group); + } else + blocks = EXT4_BLOCKS_PER_GROUP(sb); + return EXT4_NUM_B2C(EXT4_SB(sb), blocks); +} + +/* Initializes an uninitialized block bitmap */ +static void ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + unsigned int bit, bit_max; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t start, tmp; + int flex_bg = 0; + struct ext4_group_info *grp; + + J_ASSERT_BH(bh, buffer_locked(bh)); + + /* If checksum is bad mark all blocks used to prevent allocation + * essentially implementing a per-group read-only flag. 
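num_clusters_in_group() special-cases the last group because the filesystem may not end on a group boundary. The arithmetic on its own, with assumed filesystem numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t blocks_count = 100000;   /* total fs blocks (assumed) */
        uint64_t first_data_block = 1;    /* 1 for a 1k block size */
        uint32_t blocks_per_group = 8192;
        uint32_t ngroups = 13;            /* ceil((100000 - 1) / 8192) */
        uint32_t group = ngroups - 1;     /* the last group */

        /* first block of the group, then blocks remaining to end of fs */
        uint64_t group_first = first_data_block +
                               (uint64_t)group * blocks_per_group;
        uint64_t blocks = blocks_count - group_first;

        printf("last group holds %llu blocks (not %u)\n",
               (unsigned long long)blocks, blocks_per_group);
        return 0;
}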
*/ + if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { + ext4_error(sb, "Checksum bad for group %u", block_group); + grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + return; } + memset(bh->b_data, 0, sb->s_blocksize); - free_blocks = group_blocks - bit_max; + bit_max = ext4_num_base_meta_clusters(sb, block_group); + for (bit = 0; bit < bit_max; bit++) + ext4_set_bit(bit, bh->b_data); - if (bh) { - for (bit = 0; bit < bit_max; bit++) - ext4_set_bit(bit, bh->b_data); + start = ext4_group_first_block_no(sb, block_group); - start = block_group * EXT4_BLOCKS_PER_GROUP(sb) + - le32_to_cpu(sbi->s_es->s_first_data_block); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + flex_bg = 1; - /* Set bits for block and inode bitmaps, and inode table */ - ext4_set_bit(ext4_block_bitmap(sb, gdp) - start, bh->b_data); - ext4_set_bit(ext4_inode_bitmap(sb, gdp) - start, bh->b_data); - for (bit = (ext4_inode_table(sb, gdp) - start), - bit_max = bit + sbi->s_itb_per_group; bit < bit_max; bit++) - ext4_set_bit(bit, bh->b_data); + /* Set bits for block and inode bitmaps, and inode table */ + tmp = ext4_block_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); - /* - * Also if the number of blocks within the group is - * less than the blocksize * 8 ( which is the size - * of bitmap ), set rest of the block bitmap to 1 - */ - mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data); + tmp = ext4_inode_bitmap(sb, gdp); + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); + + tmp = ext4_inode_table(sb, gdp); + for (; tmp < ext4_inode_table(sb, gdp) + + sbi->s_itb_per_group; tmp++) { + if (!flex_bg || ext4_block_in_group(sb, tmp, block_group)) + ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); } - return free_blocks - sbi->s_itb_per_group - 2; + /* + * Also if the number of blocks within the group is less than + * the blocksize * 8 ( which is the size of bitmap ), set rest + * of the block bitmap to 1 + */ + ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), + sb->s_blocksize * 8, bh->b_data); + ext4_block_bitmap_csum_set(sb, block_group, gdp, bh); + ext4_group_desc_csum_set(sb, block_group, gdp); } +/* Return the number of free blocks in a block group. It is used when + * the block bitmap is uninitialized, so we can't just count the bits + * in the bitmap. */ +unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp) +{ + return num_clusters_in_group(sb, block_group) - + ext4_num_overhead_clusters(sb, block_group, gdp); +} /* * The free blocks are managed by bitmaps. A file system contains several @@ -142,9 +267,6 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * when a file system is mounted (see ext4_fill_super). 
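ext4_init_block_bitmap() builds a fresh bitmap in three strokes: set the leading base-metadata bits, set the bits for the bitmaps and the inode table, and pad every bit past the end of the group with 1s so the allocator can never hand those out. A userspace model over a byte array (set_bit() and mark_bitmap_end() are simplified stand-ins for the kernel helpers):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define BITMAP_BYTES 16                 /* 128 bits, i.e. a 128-cluster group */

static void set_bit(uint8_t *map, unsigned nr)
{
        map[nr >> 3] |= 1u << (nr & 7);
}

/* Model of ext4_mark_bitmap_end(): mark bits [start, end) as in use. */
static void mark_bitmap_end(uint8_t *map, unsigned start, unsigned end)
{
        while (start < end)
                set_bit(map, start++);
}

int main(void)
{
        uint8_t bitmap[BITMAP_BYTES];
        unsigned bit_max = 3;           /* base metadata clusters (assumed) */
        unsigned group_size = 100;      /* clusters in this group (assumed) */

        memset(bitmap, 0, sizeof(bitmap));
        for (unsigned bit = 0; bit < bit_max; bit++)
                set_bit(bitmap, bit);   /* superblock + descriptors */
        mark_bitmap_end(bitmap, group_size, BITMAP_BYTES * 8);

        printf("bits 0..%u set, bits %u..%u padded\n",
               bit_max - 1, group_size, BITMAP_BYTES * 8 - 1);
        return 0;
}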
*/ - -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - /** * ext4_get_group_desc() -- load group descriptor from disk * @sb: super block @@ -152,32 +274,29 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * @bh: pointer to the buffer head to store the block * group descriptor */ -struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, +struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, ext4_group_t block_group, - struct buffer_head ** bh) + struct buffer_head **bh) { - unsigned long group_desc; - unsigned long offset; - struct ext4_group_desc * desc; + unsigned int group_desc; + unsigned int offset; + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (block_group >= sbi->s_groups_count) { - ext4_error (sb, "ext4_get_group_desc", - "block_group >= groups_count - " - "block_group = %lu, groups_count = %lu", - block_group, sbi->s_groups_count); + if (block_group >= ngroups) { + ext4_error(sb, "block_group >= groups_count - block_group = %u," + " groups_count = %u", block_group, ngroups); return NULL; } - smp_rmb(); group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); if (!sbi->s_group_desc[group_desc]) { - ext4_error (sb, "ext4_get_group_desc", - "Group descriptor not loaded - " - "block_group = %lu, group_desc = %lu, desc = %lu", - block_group, group_desc, offset); + ext4_error(sb, "Group descriptor not loaded - " + "block_group = %u, group_desc = %u, desc = %u", + block_group, group_desc, offset); return NULL; } @@ -189,14 +308,19 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, return desc; } -static int ext4_valid_block_bitmap(struct super_block *sb, - struct ext4_group_desc *desc, - unsigned int block_group, - struct buffer_head *bh) +/* + * Return the block number which was discovered to be invalid, or 0 if + * the block bitmap is valid. + */ +static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + ext4_group_t block_group, + struct buffer_head *bh) { + struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t offset; ext4_grpblk_t next_zero_bit; - ext4_fsblk_t bitmap_blk; + ext4_fsblk_t blk; ext4_fsblk_t group_first_block; if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { @@ -206,43 +330,77 @@ static int ext4_valid_block_bitmap(struct super_block *sb, * or it has to also read the block group where the bitmaps * are located to verify they are set. 
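ext4_get_group_desc() locates a descriptor with two mask operations: the index of the descriptor block that holds it, and the slot within that block. Assuming a 4k block size, i.e. 128 descriptors of 32 bytes per block:

#include <stdio.h>

#define DESC_PER_BLOCK_BITS 7           /* 4096 / 32-byte descriptors = 128 */
#define DESC_PER_BLOCK (1u << DESC_PER_BLOCK_BITS)

int main(void)
{
        unsigned block_group = 300;

        unsigned group_desc = block_group >> DESC_PER_BLOCK_BITS;
        unsigned offset = block_group & (DESC_PER_BLOCK - 1);

        printf("group %u -> descriptor block %u, slot %u\n",
               block_group, group_desc, offset);  /* 300 -> block 2, slot 44 */
        return 0;
}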
*/ - return 1; + return 0; } group_first_block = ext4_group_first_block_no(sb, block_group); /* check whether block bitmap block number is set */ - bitmap_blk = ext4_block_bitmap(sb, desc); - offset = bitmap_blk - group_first_block; - if (!ext4_test_bit(offset, bh->b_data)) + blk = ext4_block_bitmap(sb, desc); + offset = blk - group_first_block; + if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ - goto err_out; + return blk; /* check whether the inode bitmap block number is set */ - bitmap_blk = ext4_inode_bitmap(sb, desc); - offset = bitmap_blk - group_first_block; - if (!ext4_test_bit(offset, bh->b_data)) + blk = ext4_inode_bitmap(sb, desc); + offset = blk - group_first_block; + if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ - goto err_out; + return blk; /* check whether the inode table block number is set */ - bitmap_blk = ext4_inode_table(sb, desc); - offset = bitmap_blk - group_first_block; + blk = ext4_inode_table(sb, desc); + offset = blk - group_first_block; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, - offset + EXT4_SB(sb)->s_itb_per_group, - offset); - if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group) - /* good bitmap for inode tables */ - return 1; - -err_out: - ext4_error(sb, __FUNCTION__, - "Invalid block bitmap - " - "block_group = %d, block = %llu", - block_group, bitmap_blk); + EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group), + EXT4_B2C(sbi, offset)); + if (next_zero_bit < + EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group)) + /* bad bitmap for inode tables */ + return blk; return 0; } + +static void ext4_validate_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + ext4_group_t block_group, + struct buffer_head *bh) +{ + ext4_fsblk_t blk; + struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (buffer_verified(bh)) + return; + + ext4_lock_group(sb, block_group); + blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); + if (unlikely(blk != 0)) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "bg %u: block %llu: invalid block bitmap", + block_group, blk); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + return; + } + if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, + desc, bh))) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + return; + } + set_buffer_verified(bh); + ext4_unlock_group(sb, block_group); +} + /** - * read_block_bitmap() + * ext4_read_block_bitmap_nowait() * @sb: super block * @block_group: given block group * @@ -252,10 +410,10 @@ err_out: * Return buffer_head on success or NULL in case of failure. 
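ext4_valid_block_bitmap() verifies that the blocks holding the block bitmap, the inode bitmap, and the inode table are themselves marked in use in the bitmap they describe. The same test-bit and find-next-zero-bit checks in a standalone sketch (the helpers are naive userspace stand-ins):

#include <stdio.h>
#include <stdint.h>

static int test_bit(const uint8_t *map, unsigned nr)
{
        return (map[nr >> 3] >> (nr & 7)) & 1;
}

/* Naive find_next_zero_bit over [start, max). Returns max if none found. */
static unsigned find_next_zero_bit(const uint8_t *map, unsigned max,
                                   unsigned start)
{
        while (start < max && test_bit(map, start))
                start++;
        return start;
}

int main(void)
{
        uint8_t bitmap[8] = { 0xff, 0x7f };     /* bit 15 is clear */
        unsigned itbl_off = 10, itb_per_group = 8;

        if (!test_bit(bitmap, 3))
                printf("bad: block bitmap block not marked\n");
        unsigned nz = find_next_zero_bit(bitmap, itbl_off + itb_per_group,
                                         itbl_off);
        if (nz < itbl_off + itb_per_group)
                printf("bad: inode table block at bit %u is free\n", nz);
        return 0;
}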
*/ struct buffer_head * -read_block_bitmap(struct super_block *sb, ext4_group_t block_group) +ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) { - struct ext4_group_desc * desc; - struct buffer_head * bh = NULL; + struct ext4_group_desc *desc; + struct buffer_head *bh; ext4_fsblk_t bitmap_blk; desc = ext4_get_group_desc(sb, block_group, NULL); @@ -264,1305 +422,161 @@ read_block_bitmap(struct super_block *sb, ext4_group_t block_group) bitmap_blk = ext4_block_bitmap(sb, desc); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { - ext4_error(sb, __FUNCTION__, - "Cannot read block bitmap - " - "block_group = %d, block_bitmap = %llu", - (int)block_group, (unsigned long long)bitmap_blk); + ext4_error(sb, "Cannot get buffer for block bitmap - " + "block_group = %u, block_bitmap = %llu", + block_group, bitmap_blk); return NULL; } - if (bh_uptodate_or_lock(bh)) - return bh; + if (bitmap_uptodate(bh)) + goto verify; + + lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + goto verify; + } + ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh, block_group, desc); + set_bitmap_uptodate(bh); set_buffer_uptodate(bh); + ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; } - if (bh_submit_read(bh) < 0) { - put_bh(bh); - ext4_error(sb, __FUNCTION__, - "Cannot read block bitmap - " - "block_group = %d, block_bitmap = %llu", - (int)block_group, (unsigned long long)bitmap_blk); - return NULL; - } - if (!ext4_valid_block_bitmap(sb, desc, block_group, bh)) { - put_bh(bh); - return NULL; - } - - return bh; -} -/* - * The reservation window structure operations - * -------------------------------------------- - * Operations include: - * dump, find, add, remove, is_empty, find_next_reservable_window, etc. - * - * We use a red-black tree to represent per-filesystem reservation - * windows. - * - */ - -/** - * __rsv_window_dump() -- Dump the filesystem block allocation reservation map - * @rb_root: root of per-filesystem reservation rb tree - * @verbose: verbose mode - * @fn: function which wishes to dump the reservation map - * - * If verbose is turned on, it will print the whole block reservation - * windows(start, end). Otherwise, it will only print out the "bad" windows, - * those windows that overlap with their immediate neighbors. 
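__rsv_window_dump() flags a window as bad when it is inverted (start >= end) or overlaps its predecessor in tree order. The same two checks over a plain sorted array of windows (the sample data is made up to trip the overlap case):

#include <stdio.h>

struct window { unsigned long long start, end; };

int main(void)
{
        /* Windows in tree (sorted) order; the last one overlaps (assumed). */
        struct window w[] = { {100, 199}, {200, 299}, {250, 350} };
        struct window *prev = NULL;

        for (unsigned i = 0; i < sizeof(w) / sizeof(w[0]); i++) {
                if (w[i].start && w[i].start >= w[i].end)
                        printf("Bad reservation %u (start >= end)\n", i);
                if (prev && prev->end >= w[i].start)
                        printf("Bad reservation %u (prev->end >= start)\n", i);
                prev = &w[i];
        }
        return 0;
}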
- */ -#if 1 -static void __rsv_window_dump(struct rb_root *root, int verbose, - const char *fn) -{ - struct rb_node *n; - struct ext4_reserve_window_node *rsv, *prev; - int bad; - -restart: - n = rb_first(root); - bad = 0; - prev = NULL; - - printk("Block Allocation Reservation Windows Map (%s):\n", fn); - while (n) { - rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node); - if (verbose) - printk("reservation window 0x%p " - "start: %llu, end: %llu\n", - rsv, rsv->rsv_start, rsv->rsv_end); - if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) { - printk("Bad reservation %p (start >= end)\n", - rsv); - bad = 1; - } - if (prev && prev->rsv_end >= rsv->rsv_start) { - printk("Bad reservation %p (prev->end >= start)\n", - rsv); - bad = 1; - } - if (bad) { - if (!verbose) { - printk("Restarting reservation walk in verbose mode\n"); - verbose = 1; - goto restart; - } - } - n = rb_next(n); - prev = rsv; - } - printk("Window map complete.\n"); - if (bad) - BUG(); -} -#define rsv_window_dump(root, verbose) \ - __rsv_window_dump((root), (verbose), __FUNCTION__) -#else -#define rsv_window_dump(root, verbose) do {} while (0) -#endif - -/** - * goal_in_my_reservation() - * @rsv: inode's reservation window - * @grp_goal: given goal block relative to the allocation block group - * @group: the current allocation block group - * @sb: filesystem super block - * - * Test if the given goal block (group relative) is within the file's - * own block reservation window range. - * - * If the reservation window is outside the goal allocation group, return 0; - * grp_goal (given goal block) could be -1, which means no specific - * goal block. In this case, always return 1. - * If the goal block is within the reservation window, return 1; - * otherwise, return 0; - */ -static int -goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal, - ext4_group_t group, struct super_block *sb) -{ - ext4_fsblk_t group_first_block, group_last_block; - - group_first_block = ext4_group_first_block_no(sb, group); - group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1); - - if ((rsv->_rsv_start > group_last_block) || - (rsv->_rsv_end < group_first_block)) - return 0; - if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start) - || (grp_goal + group_first_block > rsv->_rsv_end))) - return 0; - return 1; -} - -/** - * search_reserve_window() - * @rb_root: root of reservation tree - * @goal: target allocation block - * - * Find the reserved window which includes the goal, or the previous one - * if the goal is not in any window. - * Returns NULL if there are no windows or if all windows start after the goal. - */ -static struct ext4_reserve_window_node * -search_reserve_window(struct rb_root *root, ext4_fsblk_t goal) -{ - struct rb_node *n = root->rb_node; - struct ext4_reserve_window_node *rsv; - - if (!n) - return NULL; - - do { - rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node); - - if (goal < rsv->rsv_start) - n = n->rb_left; - else if (goal > rsv->rsv_end) - n = n->rb_right; - else - return rsv; - } while (n); - /* - * We've fallen off the end of the tree: the goal wasn't inside - * any particular node. OK, the previous node must be to one - * side of the interval containing the goal. If it's the RHS, - * we need to back up one. 
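search_reserve_window() descends the tree and, when it falls off the end, backs up one node if it landed on the right-hand side of the goal. The same containing-or-predecessor lookup over a sorted array, as a sketch:

#include <stdio.h>

struct window { unsigned long long start, end; };

/* Find the window containing goal, or the nearest one before it.
 * Returns -1 if every window starts after the goal. */
static int search_windows(const struct window *w, int n,
                          unsigned long long goal)
{
        int i;
        for (i = 0; i < n; i++) {
                if (goal < w[i].start)
                        break;          /* fell off: w[i] is to the right */
                else if (goal <= w[i].end)
                        return i;       /* goal inside w[i] */
        }
        return i - 1;                   /* back up to the predecessor */
}

int main(void)
{
        struct window w[] = { {100, 199}, {400, 499} };
        printf("goal 250 -> window %d\n", search_windows(w, 2, 250)); /* 0 */
        printf("goal  50 -> window %d\n", search_windows(w, 2, 50));  /* -1 */
        return 0;
}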
- */ - if (rsv->rsv_start > goal) { - n = rb_prev(&rsv->rsv_node); - rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node); - } - return rsv; -} - -/** - * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree. - * @sb: super block - * @rsv: reservation window to add - * - * Must be called with rsv_lock hold. - */ -void ext4_rsv_window_add(struct super_block *sb, - struct ext4_reserve_window_node *rsv) -{ - struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root; - struct rb_node *node = &rsv->rsv_node; - ext4_fsblk_t start = rsv->rsv_start; - - struct rb_node ** p = &root->rb_node; - struct rb_node * parent = NULL; - struct ext4_reserve_window_node *this; - - while (*p) - { - parent = *p; - this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node); - - if (start < this->rsv_start) - p = &(*p)->rb_left; - else if (start > this->rsv_end) - p = &(*p)->rb_right; - else { - rsv_window_dump(root, 1); - BUG(); - } - } - - rb_link_node(node, parent, p); - rb_insert_color(node, root); -} - -/** - * ext4_rsv_window_remove() -- unlink a window from the reservation rb tree - * @sb: super block - * @rsv: reservation window to remove - * - * Mark the block reservation window as not allocated, and unlink it - * from the filesystem reservation window rb tree. Must be called with - * rsv_lock hold. - */ -static void rsv_window_remove(struct super_block *sb, - struct ext4_reserve_window_node *rsv) -{ - rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED; - rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED; - rsv->rsv_alloc_hit = 0; - rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root); -} - -/* - * rsv_is_empty() -- Check if the reservation window is allocated. - * @rsv: given reservation window to check - * - * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED. - */ -static inline int rsv_is_empty(struct ext4_reserve_window *rsv) -{ - /* a valid reservation end block could not be 0 */ - return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED; -} - -/** - * ext4_init_block_alloc_info() - * @inode: file inode structure - * - * Allocate and initialize the reservation window structure, and - * link the window to the ext4 inode structure at last - * - * The reservation window structure is only dynamically allocated - * and linked to ext4 inode the first time the open file - * needs a new block. So, before every ext4_new_block(s) call, for - * regular files, we should check whether the reservation window - * structure exists or not. In the latter case, this function is called. - * Fail to do so will result in block reservation being turned off for that - * open file. - * - * This function is called from ext4_get_blocks_handle(), also called - * when setting the reservation window size through ioctl before the file - * is open for write (needs block allocation). - * - * Needs down_write(i_data_sem) protection prior to call this function. 
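ext4_rsv_window_add() walks left or right on start/end comparisons and treats any overlap as tree corruption (it calls BUG()). A flat-array version of the same insert-position walk, with assert() standing in for BUG():

#include <assert.h>
#include <stdio.h>

struct window { unsigned long long start, end; };

/* Return the insertion index for a window starting at 'start';
 * asserts (the kernel BUG()s) if it would overlap an existing window. */
static int insert_pos(const struct window *w, int n, unsigned long long start)
{
        for (int i = 0; i < n; i++) {
                if (start < w[i].start)
                        return i;
                assert(start > w[i].end);       /* overlap == corruption */
        }
        return n;
}

int main(void)
{
        struct window w[] = { {100, 199}, {400, 499} };
        printf("window starting at 300 goes to index %d\n",
               insert_pos(w, 2, 300));          /* 1 */
        return 0;
}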
- */ -void ext4_init_block_alloc_info(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info; - struct super_block *sb = inode->i_sb; - - block_i = kmalloc(sizeof(*block_i), GFP_NOFS); - if (block_i) { - struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node; - - rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED; - rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED; - - /* - * if filesystem is mounted with NORESERVATION, the goal - * reservation window size is set to zero to indicate - * block reservation is off - */ - if (!test_opt(sb, RESERVATION)) - rsv->rsv_goal_size = 0; - else - rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS; - rsv->rsv_alloc_hit = 0; - block_i->last_alloc_logical_block = 0; - block_i->last_alloc_physical_block = 0; - } - ei->i_block_alloc_info = block_i; -} - -/** - * ext4_discard_reservation() - * @inode: inode - * - * Discard(free) block reservation window on last file close, or truncate - * or at last iput(). - * - * It is being called in three cases: - * ext4_release_file(): last writer close the file - * ext4_clear_inode(): last iput(), when nobody link to this file. - * ext4_truncate(): when the block indirect map is about to change. - * - */ -void ext4_discard_reservation(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info; - struct ext4_reserve_window_node *rsv; - spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock; - - ext4_mb_discard_inode_preallocations(inode); - - if (!block_i) - return; - - rsv = &block_i->rsv_window_node; - if (!rsv_is_empty(&rsv->rsv_window)) { - spin_lock(rsv_lock); - if (!rsv_is_empty(&rsv->rsv_window)) - rsv_window_remove(inode->i_sb, rsv); - spin_unlock(rsv_lock); - } -} - -/** - * ext4_free_blocks_sb() -- Free given blocks and update quota - * @handle: handle to this transaction - * @sb: super block - * @block: start physcial block to free - * @count: number of blocks to free - * @pdquot_freed_blocks: pointer to quota - */ -void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, - ext4_fsblk_t block, unsigned long count, - unsigned long *pdquot_freed_blocks) -{ - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *gd_bh; - ext4_group_t block_group; - ext4_grpblk_t bit; - unsigned long i; - unsigned long overflow; - struct ext4_group_desc * desc; - struct ext4_super_block * es; - struct ext4_sb_info *sbi; - int err = 0, ret; - ext4_grpblk_t group_freed; - - *pdquot_freed_blocks = 0; - sbi = EXT4_SB(sb); - es = sbi->s_es; - if (block < le32_to_cpu(es->s_first_data_block) || - block + count < block || - block + count > ext4_blocks_count(es)) { - ext4_error (sb, "ext4_free_blocks", - "Freeing blocks not in datazone - " - "block = %llu, count = %lu", block, count); - goto error_return; - } - - ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1); - -do_more: - overflow = 0; - ext4_get_group_no_and_offset(sb, block, &block_group, &bit); - /* - * Check to see if we are freeing blocks across a group - * boundary. 
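When a free request crosses a group boundary, ext4_free_blocks_sb() clips it at EXT4_BLOCKS_PER_GROUP and loops on the remainder. The overflow arithmetic in isolation, with assumed numbers:

#include <stdio.h>

#define BLOCKS_PER_GROUP 8192

int main(void)
{
        unsigned long bit = 8000;       /* group offset of first freed block */
        unsigned long count = 500;      /* blocks to free */
        unsigned long overflow = 0;

        /* Freeing crosses into the next group: clip and remember the rest. */
        if (bit + count > BLOCKS_PER_GROUP) {
                overflow = bit + count - BLOCKS_PER_GROUP;
                count -= overflow;
        }
        printf("free %lu blocks now, %lu in the next group\n", count, overflow);
        return 0;
}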
- */ - if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { - overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); - count -= overflow; - } - brelse(bitmap_bh); - bitmap_bh = read_block_bitmap(sb, block_group); - if (!bitmap_bh) - goto error_return; - desc = ext4_get_group_desc (sb, block_group, &gd_bh); - if (!desc) - goto error_return; - - if (in_range(ext4_block_bitmap(sb, desc), block, count) || - in_range(ext4_inode_bitmap(sb, desc), block, count) || - in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || - in_range(block + count - 1, ext4_inode_table(sb, desc), - sbi->s_itb_per_group)) { - ext4_error (sb, "ext4_free_blocks", - "Freeing blocks in system zones - " - "Block = %llu, count = %lu", - block, count); - goto error_return; - } - - /* - * We are about to start releasing blocks in the bitmap, - * so we need undo access. - */ - /* @@@ check errors */ - BUFFER_TRACE(bitmap_bh, "getting undo access"); - err = ext4_journal_get_undo_access(handle, bitmap_bh); - if (err) - goto error_return; - - /* - * We are about to modify some metadata. Call the journal APIs - * to unshare ->b_data if a currently-committing transaction is - * using it - */ - BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gd_bh); - if (err) - goto error_return; - - jbd_lock_bh_state(bitmap_bh); - - for (i = 0, group_freed = 0; i < count; i++) { + ext4_unlock_group(sb, block_group); + if (buffer_uptodate(bh)) { /* - * An HJ special. This is expensive... + * if not uninit if bh is uptodate, + * bitmap is also uptodate */ -#ifdef CONFIG_JBD2_DEBUG - jbd_unlock_bh_state(bitmap_bh); - { - struct buffer_head *debug_bh; - debug_bh = sb_find_get_block(sb, block + i); - if (debug_bh) { - BUFFER_TRACE(debug_bh, "Deleted!"); - if (!bh2jh(bitmap_bh)->b_committed_data) - BUFFER_TRACE(debug_bh, - "No commited data in bitmap"); - BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); - __brelse(debug_bh); - } - } - jbd_lock_bh_state(bitmap_bh); -#endif - if (need_resched()) { - jbd_unlock_bh_state(bitmap_bh); - cond_resched(); - jbd_lock_bh_state(bitmap_bh); - } - /* @@@ This prevents newly-allocated data from being - * freed and then reallocated within the same - * transaction. - * - * Ideally we would want to allow that to happen, but to - * do so requires making jbd2_journal_forget() capable of - * revoking the queued write of a data block, which - * implies blocking on the journal lock. *forget() - * cannot block due to truncate races. - * - * Eventually we can fix this by making jbd2_journal_forget() - * return a status indicating whether or not it was able - * to revoke the buffer. On successful revoke, it is - * safe not to set the allocation bit in the committed - * bitmap, because we know that there is no outstanding - * activity on the buffer any more and so it is safe to - * reallocate it. - */ - BUFFER_TRACE(bitmap_bh, "set in b_committed_data"); - J_ASSERT_BH(bitmap_bh, - bh2jh(bitmap_bh)->b_committed_data != NULL); - ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, - bh2jh(bitmap_bh)->b_committed_data); - - /* - * We clear the bit in the bitmap after setting the committed - * data bit, because this is the reverse order to that which - * the allocator uses. 
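The in_range() guard above reads as "does b fall within [first, first + len)"; it is what keeps a bad free request from clearing bits that cover the inode table or the bitmaps. Exercised against a hypothetical inode table location:

#include <stdio.h>

#define in_range(b, first, len) \
        ((b) >= (first) && (b) <= (first) + (len) - 1)

int main(void)
{
        unsigned long long itable = 1000;       /* first inode table block */
        unsigned long itb_per_group = 512;
        unsigned long long block = 1200, count = 8;

        if (in_range(block, itable, itb_per_group) ||
            in_range(block + count - 1, itable, itb_per_group))
                printf("refusing to free blocks inside the inode table\n");
        return 0;
}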
- */ - BUFFER_TRACE(bitmap_bh, "clear bit"); - if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), - bit + i, bitmap_bh->b_data)) { - jbd_unlock_bh_state(bitmap_bh); - ext4_error(sb, __FUNCTION__, - "bit already cleared for block %llu", - (ext4_fsblk_t)(block + i)); - jbd_lock_bh_state(bitmap_bh); - BUFFER_TRACE(bitmap_bh, "bit already cleared"); - } else { - group_freed++; - } - } - jbd_unlock_bh_state(bitmap_bh); - - spin_lock(sb_bgl_lock(sbi, block_group)); - desc->bg_free_blocks_count = - cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) + - group_freed); - desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); - spin_unlock(sb_bgl_lock(sbi, block_group)); - percpu_counter_add(&sbi->s_freeblocks_counter, count); - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_journal_dirty_metadata(handle, bitmap_bh); - - /* And the group descriptor block */ - BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_journal_dirty_metadata(handle, gd_bh); - if (!err) err = ret; - *pdquot_freed_blocks += group_freed; - - if (overflow && !err) { - block += count; - count = overflow; - goto do_more; - } - sb->s_dirt = 1; -error_return: - brelse(bitmap_bh); - ext4_std_error(sb, err); - return; -} - -/** - * ext4_free_blocks() -- Free given blocks and update quota - * @handle: handle for this transaction - * @inode: inode - * @block: start physical block to free - * @count: number of blocks to count - * @metadata: Are these metadata blocks - */ -void ext4_free_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t block, unsigned long count, - int metadata) -{ - struct super_block * sb; - unsigned long dquot_freed_blocks; - - /* this isn't the right place to decide whether block is metadata - * inode.c/extents.c knows better, but for safety ... */ - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || - ext4_should_journal_data(inode)) - metadata = 1; - - sb = inode->i_sb; - - if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info) - ext4_free_blocks_sb(handle, sb, block, count, - &dquot_freed_blocks); - else - ext4_mb_free_blocks(handle, inode, block, count, - metadata, &dquot_freed_blocks); - if (dquot_freed_blocks) - DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); - return; -} - -/** - * ext4_test_allocatable() - * @nr: given allocation block group - * @bh: bufferhead contains the bitmap of the given block group - * - * For ext4 allocations, we must not reuse any blocks which are - * allocated in the bitmap buffer's "last committed data" copy. This - * prevents deletes from freeing up the page for reuse until we have - * committed the delete transaction. - * - * If we didn't do this, then deleting something and reallocating it as - * data would allow the old block to be overwritten before the - * transaction committed (because we force data to disk before commit). - * This would lead to corruption if we crashed between overwriting the - * data and committing the delete. - * - * @@@ We may want to make this allocation behaviour conditional on - * data-writes at some point, and disable it for metadata allocations or - * sync-data inodes. 
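ext4_test_allocatable() only reports a block free when it is clear both in the live bitmap and in the journal's last-committed copy, so deleted blocks cannot be reused before the delete commits. Modeled with two small byte arrays:

#include <stdio.h>
#include <stdint.h>

static int test_bit(const uint8_t *map, unsigned nr)
{
        return (map[nr >> 3] >> (nr & 7)) & 1;
}

/* A block is allocatable only if free in the live bitmap and not
 * still held by the committing transaction's copy. */
static int test_allocatable(unsigned nr, const uint8_t *live,
                            const uint8_t *committed)
{
        if (test_bit(live, nr))
                return 0;
        return committed ? !test_bit(committed, nr) : 1;
}

int main(void)
{
        uint8_t live[1] = { 0x01 };       /* bit 0 allocated now */
        uint8_t committed[1] = { 0x02 };  /* bit 1 freed but uncommitted */

        for (unsigned nr = 0; nr < 3; nr++)
                printf("block %u allocatable: %d\n", nr,
                       test_allocatable(nr, live, committed));
        return 0;
}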
- */ -static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh) -{ - int ret; - struct journal_head *jh = bh2jh(bh); - - if (ext4_test_bit(nr, bh->b_data)) - return 0; - - jbd_lock_bh_state(bh); - if (!jh->b_committed_data) - ret = 1; - else - ret = !ext4_test_bit(nr, jh->b_committed_data); - jbd_unlock_bh_state(bh); - return ret; -} - -/** - * bitmap_search_next_usable_block() - * @start: the starting block (group relative) of the search - * @bh: bufferhead contains the block group bitmap - * @maxblocks: the ending block (group relative) of the reservation - * - * The bitmap search --- search forward alternately through the actual - * bitmap on disk and the last-committed copy in journal, until we find a - * bit free in both bitmaps. - */ -static ext4_grpblk_t -bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh, - ext4_grpblk_t maxblocks) -{ - ext4_grpblk_t next; - struct journal_head *jh = bh2jh(bh); - - while (start < maxblocks) { - next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start); - if (next >= maxblocks) - return -1; - if (ext4_test_allocatable(next, bh)) - return next; - jbd_lock_bh_state(bh); - if (jh->b_committed_data) - start = ext4_find_next_zero_bit(jh->b_committed_data, - maxblocks, next); - jbd_unlock_bh_state(bh); - } - return -1; -} - -/** - * find_next_usable_block() - * @start: the starting block (group relative) to find next - * allocatable block in bitmap. - * @bh: bufferhead contains the block group bitmap - * @maxblocks: the ending block (group relative) for the search - * - * Find an allocatable block in a bitmap. We honor both the bitmap and - * its last-committed copy (if that exists), and perform the "most - * appropriate allocation" algorithm of looking for a free block near - * the initial goal; then for a free byte somewhere in the bitmap; then - * for any free bit in the bitmap. - */ -static ext4_grpblk_t -find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh, - ext4_grpblk_t maxblocks) -{ - ext4_grpblk_t here, next; - char *p, *r; - - if (start > 0) { - /* - * The goal was occupied; search forward for a free - * block within the next XX blocks. - * - * end_goal is more or less random, but it has to be - * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the - * next 64-bit boundary is simple.. 
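find_next_usable_block() first probes a short range up to the next 64-bit boundary with (start + 63) & ~63, then falls back to scanning for a whole free byte; the kernel uses memscan() for that, which memchr() approximates in userspace. Both steps in isolation:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void)
{
        uint8_t bitmap[16];
        memset(bitmap, 0xff, sizeof(bitmap));
        bitmap[9] = 0;                          /* bits 72..79 are free */

        int start = 70;
        int end_goal = (start + 63) & ~63;      /* 70 -> 128 */
        printf("probe bits %d..%d first\n", start, end_goal - 1);

        /* memscan-style fallback: find the first fully-free byte */
        uint8_t *p = bitmap + (start >> 3);
        uint8_t *r = memchr(p, 0, sizeof(bitmap) - (start >> 3));
        if (r)
                printf("first free byte covers bits %ld..%ld\n",
                       (r - bitmap) * 8L, (r - bitmap) * 8L + 7);
        return 0;
}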
- */ - ext4_grpblk_t end_goal = (start + 63) & ~63; - if (end_goal > maxblocks) - end_goal = maxblocks; - here = ext4_find_next_zero_bit(bh->b_data, end_goal, start); - if (here < end_goal && ext4_test_allocatable(here, bh)) - return here; - ext4_debug("Bit not found near goal\n"); + set_bitmap_uptodate(bh); + unlock_buffer(bh); + goto verify; } - - here = start; - if (here < 0) - here = 0; - - p = ((char *)bh->b_data) + (here >> 3); - r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3)); - next = (r - ((char *)bh->b_data)) << 3; - - if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh)) - return next; - /* - * The bitmap search --- search forward alternately through the actual - * bitmap and the last-committed copy until we find a bit free in - * both + * submit the buffer_head for reading */ - here = bitmap_search_next_usable_block(here, bh, maxblocks); - return here; + set_buffer_new(bh); + trace_ext4_read_block_bitmap_load(sb, block_group); + bh->b_end_io = ext4_end_bitmap_read; + get_bh(bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); + return bh; +verify: + ext4_validate_block_bitmap(sb, desc, block_group, bh); + if (buffer_verified(bh)) + return bh; + put_bh(bh); + return NULL; } -/** - * claim_block() - * @block: the free block (group relative) to allocate - * @bh: the bufferhead containts the block group bitmap - * - * We think we can allocate this block in this bitmap. Try to set the bit. - * If that succeeds then check that nobody has allocated and then freed the - * block since we saw that is was not marked in b_committed_data. If it _was_ - * allocated and freed then clear the bit in the bitmap again and return - * zero (failure). - */ -static inline int -claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh) +/* Returns 0 on success, 1 on error */ +int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, + struct buffer_head *bh) { - struct journal_head *jh = bh2jh(bh); - int ret; + struct ext4_group_desc *desc; - if (ext4_set_bit_atomic(lock, block, bh->b_data)) + if (!buffer_new(bh)) return 0; - jbd_lock_bh_state(bh); - if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) { - ext4_clear_bit_atomic(lock, block, bh->b_data); - ret = 0; - } else { - ret = 1; - } - jbd_unlock_bh_state(bh); - return ret; -} - -/** - * ext4_try_to_allocate() - * @sb: superblock - * @handle: handle to this transaction - * @group: given allocation block group - * @bitmap_bh: bufferhead holds the block bitmap - * @grp_goal: given target block within the group - * @count: target number of blocks to allocate - * @my_rsv: reservation window - * - * Attempt to allocate blocks within a give range. Set the range of allocation - * first, then find the first free bit(s) from the bitmap (within the range), - * and at last, allocate the blocks by claiming the found free bit as allocated. - * - * To set the range of this allocation: - * if there is a reservation window, only try to allocate block(s) from the - * file's own reservation window; - * Otherwise, the allocation range starts from the give goal block, ends at - * the block group's last block. - * - * If we failed to allocate the desired block then we may end up crossing to a - * new bitmap. In that case we must release write access to the old one via - * ext4_journal_release_buffer(), else we'll run out of credits. 
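(The claim step referred to above, claim_block(), follows a set-then-verify pattern: set the bit atomically, then undo the claim if the last-committed copy shows the block was freed by a not-yet-committed transaction. A minimal user-space sketch with C11 atomics; the kernel also holds the buffer state lock around the committed-copy check, which is omitted here.)

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic unsigned long live;      /* live bitmap, one 64-bit word */
static unsigned long committed;         /* last-committed copy */

static bool model_claim_block(int nr)
{
        unsigned long mask = 1UL << nr;

        /* Try to set the bit; a set return value means we lost the race. */
        if (atomic_fetch_or(&live, mask) & mask)
                return false;
        /* Allocated and freed since the last commit?  Undo the claim:
         * reusing the block could expose stale data after a crash. */
        if (committed & mask) {
                atomic_fetch_and(&live, ~mask);
                return false;
        }
        return true;
}

int main(void)
{
        committed = 1UL << 3;   /* block 3 freed in this transaction */
        printf("claim 3: %d\n", model_claim_block(3));  /* 0: must wait */
        printf("claim 4: %d\n", model_claim_block(4));  /* 1: ours */
        printf("claim 4: %d\n", model_claim_block(4));  /* 0: taken */
        return 0;
}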
- */ -static ext4_grpblk_t -ext4_try_to_allocate(struct super_block *sb, handle_t *handle, - ext4_group_t group, struct buffer_head *bitmap_bh, - ext4_grpblk_t grp_goal, unsigned long *count, - struct ext4_reserve_window *my_rsv) -{ - ext4_fsblk_t group_first_block; - ext4_grpblk_t start, end; - unsigned long num = 0; - - /* we do allocation within the reservation window if we have a window */ - if (my_rsv) { - group_first_block = ext4_group_first_block_no(sb, group); - if (my_rsv->_rsv_start >= group_first_block) - start = my_rsv->_rsv_start - group_first_block; - else - /* reservation window cross group boundary */ - start = 0; - end = my_rsv->_rsv_end - group_first_block + 1; - if (end > EXT4_BLOCKS_PER_GROUP(sb)) - /* reservation window crosses group boundary */ - end = EXT4_BLOCKS_PER_GROUP(sb); - if ((start <= grp_goal) && (grp_goal < end)) - start = grp_goal; - else - grp_goal = -1; - } else { - if (grp_goal > 0) - start = grp_goal; - else - start = 0; - end = EXT4_BLOCKS_PER_GROUP(sb); - } - - BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb)); - -repeat: - if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) { - grp_goal = find_next_usable_block(start, bitmap_bh, end); - if (grp_goal < 0) - goto fail_access; - if (!my_rsv) { - int i; - - for (i = 0; i < 7 && grp_goal > start && - ext4_test_allocatable(grp_goal - 1, - bitmap_bh); - i++, grp_goal--) - ; - } - } - start = grp_goal; - - if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group), - grp_goal, bitmap_bh)) { - /* - * The block was allocated by another thread, or it was - * allocated and then freed by another thread - */ - start++; - grp_goal++; - if (start >= end) - goto fail_access; - goto repeat; - } - num++; - grp_goal++; - while (num < *count && grp_goal < end - && ext4_test_allocatable(grp_goal, bitmap_bh) - && claim_block(sb_bgl_lock(EXT4_SB(sb), group), - grp_goal, bitmap_bh)) { - num++; - grp_goal++; - } - *count = num; - return grp_goal - num; -fail_access: - *count = num; - return -1; -} - -/** - * find_next_reservable_window(): - * find a reservable space within the given range. - * It does not allocate the reservation window for now: - * alloc_new_reservation() will do the work later. - * - * @search_head: the head of the searching list; - * This is not necessarily the list head of the whole filesystem - * - * We have both head and start_block to assist the search - * for the reservable space. The list starts from head, - * but we will shift to the place where start_block is, - * then start from there, when looking for a reservable space. - * - * @size: the target new reservation window size - * - * @group_first_block: the first block we consider to start - * the real search from - * - * @last_block: - * the maximum block number that our goal reservable space - * could start from. This is normally the last block in this - * group. The search will end when we found the start of next - * possible reservable space is out of this boundary. - * This could handle the cross boundary reservation window - * request. - * - * basically we search from the given range, rather than the whole - * reservation double linked list, (start_block, last_block) - * to find a free region that is of my size and has not - * been reserved. 
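(A user-space sketch of that search, with a sorted array standing in for the kernel's red-black tree of reservation windows; names, types and the example numbers are illustrative only.)

#include <stdio.h>

struct window { unsigned long start, end; };    /* inclusive, sorted */

/* Return the first block of a gap of at least `size` blocks inside
 * [start, last] that no existing window covers, or -1 on failure. */
static long find_next_gap(const struct window *w, int nr,
                          unsigned long start, unsigned long last,
                          unsigned long size)
{
        unsigned long cur = start;
        int i;

        for (i = 0; i < nr; i++) {
                if (cur > w[i].end)
                        continue;               /* window lies behind us */
                if (cur + size <= w[i].start)
                        return (long)cur;       /* gap before this window */
                cur = w[i].end + 1;             /* skip past the window */
                if (cur > last)
                        return -1;
        }
        return (cur + size - 1 <= last) ? (long)cur : -1;
}

int main(void)
{
        struct window w[] = { {10, 19}, {25, 40} };

        /* 5..12 collides with 10..19, and the 20..24 gap holds only 5
         * blocks, so the first 8-block fit starts at 41. */
        printf("gap at: %ld\n", find_next_gap(w, 2, 5, 100, 8));
        return 0;
}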
- * - */ -static int find_next_reservable_window( - struct ext4_reserve_window_node *search_head, - struct ext4_reserve_window_node *my_rsv, - struct super_block * sb, - ext4_fsblk_t start_block, - ext4_fsblk_t last_block) -{ - struct rb_node *next; - struct ext4_reserve_window_node *rsv, *prev; - ext4_fsblk_t cur; - int size = my_rsv->rsv_goal_size; - - /* TODO: make the start of the reservation window byte-aligned */ - /* cur = *start_block & ~7;*/ - cur = start_block; - rsv = search_head; - if (!rsv) - return -1; - - while (1) { - if (cur <= rsv->rsv_end) - cur = rsv->rsv_end + 1; - - /* TODO? - * in the case we could not find a reservable space - * that is what is expected, during the re-search, we could - * remember what's the largest reservable space we could have - * and return that one. - * - * For now it will fail if we could not find the reservable - * space with expected-size (or more)... - */ - if (cur > last_block) - return -1; /* fail */ - - prev = rsv; - next = rb_next(&rsv->rsv_node); - rsv = rb_entry(next,struct ext4_reserve_window_node,rsv_node); - - /* - * Reached the last reservation, we can just append to the - * previous one. - */ - if (!next) - break; - - if (cur + size <= rsv->rsv_start) { - /* - * Found a reserveable space big enough. We could - * have a reservation across the group boundary here - */ - break; - } - } - /* - * we come here either : - * when we reach the end of the whole list, - * and there is empty reservable space after last entry in the list. - * append it to the end of the list. - * - * or we found one reservable space in the middle of the list, - * return the reservation window that we could append to. - * succeed. - */ - - if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) - rsv_window_remove(sb, my_rsv); - - /* - * Let's book the whole avaliable window for now. We will check the - * disk bitmap later and then, if there are free blocks then we adjust - * the window size if it's larger than requested. - * Otherwise, we will remove this node from the tree next time - * call find_next_reservable_window. - */ - my_rsv->rsv_start = cur; - my_rsv->rsv_end = cur + size - 1; - my_rsv->rsv_alloc_hit = 0; - - if (prev != my_rsv) - ext4_rsv_window_add(sb, my_rsv); - - return 0; -} - -/** - * alloc_new_reservation()--allocate a new reservation window - * - * To make a new reservation, we search part of the filesystem - * reservation list (the list that inside the group). We try to - * allocate a new reservation window near the allocation goal, - * or the beginning of the group, if there is no goal. - * - * We first find a reservable space after the goal, then from - * there, we check the bitmap for the first free block after - * it. If there is no free block until the end of group, then the - * whole group is full, we failed. Otherwise, check if the free - * block is inside the expected reservable space, if so, we - * succeed. - * If the first free block is outside the reservable space, then - * start from the first free block, we search for next available - * space, and go on. - * - * on succeed, a new reservation will be found and inserted into the list - * It contains at least one free block, and it does not overlap with other - * reservation windows. - * - * failed: we failed to find a reservation window in this group - * - * @rsv: the reservation - * - * @grp_goal: The goal (group-relative). It is where the search for a - * free reservable space should start from. 
- * if we have a grp_goal(grp_goal >0 ), then start from there, - * no grp_goal(grp_goal = -1), we start from the first block - * of the group. - * - * @sb: the super block - * @group: the group we are trying to allocate in - * @bitmap_bh: the block group block bitmap - * - */ -static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv, - ext4_grpblk_t grp_goal, struct super_block *sb, - ext4_group_t group, struct buffer_head *bitmap_bh) -{ - struct ext4_reserve_window_node *search_head; - ext4_fsblk_t group_first_block, group_end_block, start_block; - ext4_grpblk_t first_free_block; - struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root; - unsigned long size; - int ret; - spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock; - - group_first_block = ext4_group_first_block_no(sb, group); - group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1); - - if (grp_goal < 0) - start_block = group_first_block; - else - start_block = grp_goal + group_first_block; - - size = my_rsv->rsv_goal_size; - - if (!rsv_is_empty(&my_rsv->rsv_window)) { - /* - * if the old reservation is cross group boundary - * and if the goal is inside the old reservation window, - * we will come here when we just failed to allocate from - * the first part of the window. We still have another part - * that belongs to the next group. In this case, there is no - * point to discard our window and try to allocate a new one - * in this group(which will fail). we should - * keep the reservation window, just simply move on. - * - * Maybe we could shift the start block of the reservation - * window to the first block of next group. - */ - - if ((my_rsv->rsv_start <= group_end_block) && - (my_rsv->rsv_end > group_end_block) && - (start_block >= my_rsv->rsv_start)) - return -1; - - if ((my_rsv->rsv_alloc_hit > - (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) { - /* - * if the previously allocation hit ratio is - * greater than 1/2, then we double the size of - * the reservation window the next time, - * otherwise we keep the same size window - */ - size = size * 2; - if (size > EXT4_MAX_RESERVE_BLOCKS) - size = EXT4_MAX_RESERVE_BLOCKS; - my_rsv->rsv_goal_size= size; - } - } - - spin_lock(rsv_lock); - /* - * shift the search start to the window near the goal block - */ - search_head = search_reserve_window(fs_rsv_root, start_block); - - /* - * find_next_reservable_window() simply finds a reservable window - * inside the given range(start_block, group_end_block). - * - * To make sure the reservation window has a free bit inside it, we - * need to check the bitmap after we found a reservable window. - */ -retry: - ret = find_next_reservable_window(search_head, my_rsv, sb, - start_block, group_end_block); - - if (ret == -1) { - if (!rsv_is_empty(&my_rsv->rsv_window)) - rsv_window_remove(sb, my_rsv); - spin_unlock(rsv_lock); - return -1; - } - - /* - * On success, find_next_reservable_window() returns the - * reservation window where there is a reservable space after it. - * Before we reserve this reservable space, we need - * to make sure there is at least a free block inside this region. - * - * searching the first free bit on the block bitmap and copy of - * last committed bitmap alternatively, until we found a allocatable - * block. Search start from the start block of the reservable space - * we just found. 
- */ - spin_unlock(rsv_lock); - first_free_block = bitmap_search_next_usable_block( - my_rsv->rsv_start - group_first_block, - bitmap_bh, group_end_block - group_first_block + 1); - - if (first_free_block < 0) { - /* - * no free block left on the bitmap, no point - * to reserve the space. return failed. - */ - spin_lock(rsv_lock); - if (!rsv_is_empty(&my_rsv->rsv_window)) - rsv_window_remove(sb, my_rsv); - spin_unlock(rsv_lock); - return -1; /* failed */ + desc = ext4_get_group_desc(sb, block_group, NULL); + if (!desc) + return 1; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + ext4_error(sb, "Cannot read block bitmap - " + "block_group = %u, block_bitmap = %llu", + block_group, (unsigned long long) bh->b_blocknr); + return 1; } - - start_block = first_free_block + group_first_block; - /* - * check if the first free block is within the - * free space we just reserved - */ - if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end) - return 0; /* success */ - /* - * if the first free bit we found is out of the reservable space - * continue search for next reservable space, - * start from where the free block is, - * we also shift the list head to where we stopped last time - */ - search_head = my_rsv; - spin_lock(rsv_lock); - goto retry; + clear_buffer_new(bh); + /* Panic or remount fs read-only if block bitmap is invalid */ + ext4_validate_block_bitmap(sb, desc, block_group, bh); + /* ...but check for error just in case errors=continue. */ + return !buffer_verified(bh); } -/** - * try_to_extend_reservation() - * @my_rsv: given reservation window - * @sb: super block - * @size: the delta to extend - * - * Attempt to expand the reservation window large enough to have - * required number of free blocks - * - * Since ext4_try_to_allocate() will always allocate blocks within - * the reservation window range, if the window size is too small, - * multiple blocks allocation has to stop at the end of the reservation - * window. 
To make this more efficient, given the total number of - * blocks needed and the current size of the window, we try to - * expand the reservation window size if necessary on a best-effort - * basis before ext4_new_blocks() tries to allocate blocks, - */ -static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv, - struct super_block *sb, int size) +struct buffer_head * +ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) { - struct ext4_reserve_window_node *next_rsv; - struct rb_node *next; - spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock; - - if (!spin_trylock(rsv_lock)) - return; - - next = rb_next(&my_rsv->rsv_node); + struct buffer_head *bh; - if (!next) - my_rsv->rsv_end += size; - else { - next_rsv = rb_entry(next, struct ext4_reserve_window_node, rsv_node); - - if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size) - my_rsv->rsv_end += size; - else - my_rsv->rsv_end = next_rsv->rsv_start - 1; + bh = ext4_read_block_bitmap_nowait(sb, block_group); + if (!bh) + return NULL; + if (ext4_wait_block_bitmap(sb, block_group, bh)) { + put_bh(bh); + return NULL; } - spin_unlock(rsv_lock); + return bh; } /** - * ext4_try_to_allocate_with_rsv() - * @sb: superblock - * @handle: handle to this transaction - * @group: given allocation block group - * @bitmap_bh: bufferhead holds the block bitmap - * @grp_goal: given target block within the group - * @count: target number of blocks to allocate - * @my_rsv: reservation window - * @errp: pointer to store the error code - * - * This is the main function used to allocate a new block and its reservation - * window. - * - * Each time when a new block allocation is need, first try to allocate from - * its own reservation. If it does not have a reservation window, instead of - * looking for a free bit on bitmap first, then look up the reservation list to - * see if it is inside somebody else's reservation window, we try to allocate a - * reservation window for it starting from the goal first. Then do the block - * allocation within the reservation window. - * - * This will avoid keeping on searching the reservation list again and - * again when somebody is looking for a free block (without - * reservation), and there are lots of free blocks, but they are all - * being reserved. - * - * We use a red-black tree for the per-filesystem reservation list. + * ext4_has_free_clusters() + * @sbi: in-core super block structure. + * @nclusters: number of needed blocks + * @flags: flags from ext4_mb_new_blocks() * + * Check if filesystem has nclusters free & available for allocation. + * On success return 1, return 0 on failure. 
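(ext4_has_free_clusters(), which follows, reads the cheap but possibly stale percpu counters first and pays for an exact sum only when the cheap answer comes close to failing. The user-space model below shows that two-phase pattern; the counter layout and watermark value are stand-ins, not the kernel's percpu_counter.)

#include <stdio.h>

#define NCPUS           4
#define WATERMARK       64      /* stand-in for EXT4_FREECLUSTERS_WATERMARK */

struct pcpu_counter {
        long long base;
        long long delta[NCPUS]; /* unflushed per-CPU contributions */
};

static long long read_fast(const struct pcpu_counter *c)
{
        return c->base;         /* ignores deltas: cheap but approximate */
}

static long long sum_exact(const struct pcpu_counter *c)
{
        long long v = c->base;
        int i;

        for (i = 0; i < NCPUS; i++)
                v += c->delta[i];
        return v;
}

static int has_free(const struct pcpu_counter *free_c,
                    long long needed, long long reserved)
{
        long long free_clusters = read_fast(free_c);

        /* Close to the line?  Get an exact answer before failing. */
        if (free_clusters - (needed + reserved) < WATERMARK)
                free_clusters = sum_exact(free_c);
        return free_clusters >= needed + reserved;
}

int main(void)
{
        struct pcpu_counter fc = { .base = 100, .delta = {20, 5, 0, 10} };

        /* The fast read (100) is within the watermark of 90, so the
         * exact sum (135) decides and the request succeeds. */
        printf("has_free: %d\n", has_free(&fc, 90, 0));
        return 0;
}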
 */
-static ext4_grpblk_t
-ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
-                       ext4_group_t group, struct buffer_head *bitmap_bh,
-                       ext4_grpblk_t grp_goal,
-                       struct ext4_reserve_window_node * my_rsv,
-                       unsigned long *count, int *errp)
+static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
+                                  s64 nclusters, unsigned int flags)
 {
-       ext4_fsblk_t group_first_block, group_last_block;
-       ext4_grpblk_t ret = 0;
-       int fatal;
-       unsigned long num = *count;
+       s64 free_clusters, dirty_clusters, rsv, resv_clusters;
+       struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
+       struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
 
-       *errp = 0;
+       free_clusters = percpu_counter_read_positive(fcc);
+       dirty_clusters = percpu_counter_read_positive(dcc);
+       resv_clusters = atomic64_read(&sbi->s_resv_clusters);
 
        /*
-        * Make sure we use undo access for the bitmap, because it is critical
-        * that we do the frozen_data COW on bitmap buffers in all cases even
-        * if the buffer is in BJ_Forget state in the committing transaction.
+        * r_blocks_count should always be a multiple of the cluster ratio so
+        * we are safe to do a plain bit shift only.
        */
-       BUFFER_TRACE(bitmap_bh, "get undo access for new block");
-       fatal = ext4_journal_get_undo_access(handle, bitmap_bh);
-       if (fatal) {
-               *errp = fatal;
-               return -1;
-       }
+       rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
+             resv_clusters;
 
-       /*
-        * we don't deal with reservation when
-        * filesystem is mounted without reservation
-        * or the file is not a regular file
-        * or last attempt to allocate a block with reservation turned on failed
-        */
-       if (my_rsv == NULL ) {
-               ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
-                                               grp_goal, count, NULL);
-               goto out;
+       if (free_clusters - (nclusters + rsv + dirty_clusters) <
+                                       EXT4_FREECLUSTERS_WATERMARK) {
+               free_clusters = percpu_counter_sum_positive(fcc);
+               dirty_clusters = percpu_counter_sum_positive(dcc);
        }
-       /*
-        * grp_goal is a group relative block number (if there is a goal)
-        * 0 <= grp_goal < EXT4_BLOCKS_PER_GROUP(sb)
-        * first block is a filesystem wide block number
-        * first block is the block number of the first block in this group
+       /* Check whether we have space after accounting for current
+        * dirty clusters & root reserved clusters.
        */
-       group_first_block = ext4_group_first_block_no(sb, group);
-       group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
+       if (free_clusters >= (rsv + nclusters + dirty_clusters))
+               return 1;
 
-       /*
-        * Basically we will allocate a new block from inode's reservation
-        * window.
-        *
-        * We need to allocate a new reservation window, if:
-        * a) inode does not have a reservation window; or
-        * b) last attempt to allocate a block from existing reservation
-        *    failed; or
-        * c) we come here with a goal and with a reservation window
-        *
-        * We do not need to allocate a new reservation window if we come here
-        * at the beginning with a goal and the goal is inside the window, or
-        * we don't have a goal but already have a reservation window;
-        * then we can allocate from the reservation window directly.
- */ - while (1) { - if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || - !goal_in_my_reservation(&my_rsv->rsv_window, - grp_goal, group, sb)) { - if (my_rsv->rsv_goal_size < *count) - my_rsv->rsv_goal_size = *count; - ret = alloc_new_reservation(my_rsv, grp_goal, sb, - group, bitmap_bh); - if (ret < 0) - break; /* failed */ - - if (!goal_in_my_reservation(&my_rsv->rsv_window, - grp_goal, group, sb)) - grp_goal = -1; - } else if (grp_goal >= 0) { - int curr = my_rsv->rsv_end - - (grp_goal + group_first_block) + 1; - - if (curr < *count) - try_to_extend_reservation(my_rsv, sb, - *count - curr); - } + /* Hm, nope. Are (enough) root reserved clusters available? */ + if (uid_eq(sbi->s_resuid, current_fsuid()) || + (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || + capable(CAP_SYS_RESOURCE) || + (flags & EXT4_MB_USE_ROOT_BLOCKS)) { - if ((my_rsv->rsv_start > group_last_block) || - (my_rsv->rsv_end < group_first_block)) { - rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1); - BUG(); - } - ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh, - grp_goal, &num, &my_rsv->rsv_window); - if (ret >= 0) { - my_rsv->rsv_alloc_hit += num; - *count = num; - break; /* succeed */ - } - num = *count; + if (free_clusters >= (nclusters + dirty_clusters + + resv_clusters)) + return 1; } -out: - if (ret >= 0) { - BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for " - "bitmap block"); - fatal = ext4_journal_dirty_metadata(handle, bitmap_bh); - if (fatal) { - *errp = fatal; - return -1; - } - return ret; + /* No free blocks. Let's see if we can dip into reserved pool */ + if (flags & EXT4_MB_USE_RESERVED) { + if (free_clusters >= (nclusters + dirty_clusters)) + return 1; } - BUFFER_TRACE(bitmap_bh, "journal_release_buffer"); - ext4_journal_release_buffer(handle, bitmap_bh); - return ret; + return 0; } -/** - * ext4_has_free_blocks() - * @sbi: in-core super block structure. - * - * Check if filesystem has at least 1 free block available for allocation. - */ -static int ext4_has_free_blocks(struct ext4_sb_info *sbi) +int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags) { - ext4_fsblk_t free_blocks, root_blocks; - - free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - root_blocks = ext4_r_blocks_count(sbi->s_es); - if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && - sbi->s_resuid != current->fsuid && - (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + if (ext4_has_free_clusters(sbi, nclusters, flags)) { + percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters); return 0; - } - return 1; + } else + return -ENOSPC; } /** @@ -1572,14 +586,16 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi) * * ext4_should_retry_alloc() is called when ENOSPC is returned, and if * it is profitable to retry the operation, this function will wait - * for the current or commiting transaction to complete, and then + * for the current or committing transaction to complete, and then * return TRUE. * * if the total number of retries exceed three times, return FALSE. 
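(The expected caller pattern is a loop: allocate, and on ENOSPC ask this helper whether a forced commit makes a retry worthwhile. The stand-alone model below mirrors only the shape of that protocol; the forced journal commit is elided.)

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Shape of the retry test: no journal to flush, or more than three
 * retries already, means give up; otherwise the kernel would force a
 * transaction commit here and report "worth retrying". */
static bool model_should_retry(bool have_journal, int *retries)
{
        if (!have_journal || (*retries)++ > 3)
                return false;
        return true;
}

int main(void)
{
        int attempts = 0, retries = 0, err;

        do {
                attempts++;
                /* Succeed on the 3rd attempt, as if a commit freed space. */
                err = (attempts < 3) ? -ENOSPC : 0;
        } while (err == -ENOSPC && model_should_retry(true, &retries));

        printf("err=%d after %d attempts\n", err, attempts);
        return 0;
}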
*/ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3) + if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) || + (*retries)++ > 3 || + !EXT4_SB(sb)->s_journal) return 0; jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); @@ -1587,336 +603,67 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); } -/** - * ext4_new_blocks_old() -- core block(s) allocation function - * @handle: handle to this transaction - * @inode: file inode - * @goal: given target block(filesystem wide) - * @count: target number of blocks to allocate - * @errp: error code +/* + * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks * - * ext4_new_blocks uses a goal block to assist allocation. It tries to - * allocate block(s) from the block group contains the goal block first. If that - * fails, it will try to allocate block(s) from other block groups without - * any specific goal block. + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @count: pointer to total number of clusters needed + * @errp: error code * + * Return 1st allocated block number on success, *count stores total account + * error stores in errp pointer */ -ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned long *count, int *errp) -{ - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *gdp_bh; - ext4_group_t group_no; - ext4_group_t goal_group; - ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */ - ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/ - ext4_fsblk_t ret_block; /* filesyetem-wide allocated block */ - ext4_group_t bgi; /* blockgroup iteration index */ - int fatal = 0, err; - int performed_allocation = 0; - ext4_grpblk_t free_blocks; /* number of free blocks in a group */ - struct super_block *sb; - struct ext4_group_desc *gdp; - struct ext4_super_block *es; - struct ext4_sb_info *sbi; - struct ext4_reserve_window_node *my_rsv = NULL; - struct ext4_block_alloc_info *block_i; - unsigned short windowsz = 0; - ext4_group_t ngroups; - unsigned long num = *count; - - *errp = -ENOSPC; - sb = inode->i_sb; - if (!sb) { - printk("ext4_new_block: nonexistent device"); - return 0; - } - - /* - * Check quota for allocation of this block. - */ - if (DQUOT_ALLOC_BLOCK(inode, num)) { - *errp = -EDQUOT; - return 0; - } - - sbi = EXT4_SB(sb); - es = EXT4_SB(sb)->s_es; - ext4_debug("goal=%llu.\n", goal); - /* - * Allocate a block from reservation only when - * filesystem is mounted with reservation(default,-o reservation), and - * it's a regular file, and - * the desired window size is greater than 0 (One could use ioctl - * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off - * reservation on that particular file) - */ - block_i = EXT4_I(inode)->i_block_alloc_info; - if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) - my_rsv = &block_i->rsv_window_node; - - if (!ext4_has_free_blocks(sbi)) { - *errp = -ENOSPC; - goto out; - } - - /* - * First, test whether the goal block is free. 
- */
-       if (goal < le32_to_cpu(es->s_first_data_block) ||
-           goal >= ext4_blocks_count(es))
-               goal = le32_to_cpu(es->s_first_data_block);
-       ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk);
-       goal_group = group_no;
-retry_alloc:
-       gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
-       if (!gdp)
-               goto io_error;
-
-       free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
-       /*
-        * if there are not enough free blocks to make a new reservation,
-        * turn off reservation for this allocation
-        */
-       if (my_rsv && (free_blocks < windowsz)
-               && (rsv_is_empty(&my_rsv->rsv_window)))
-               my_rsv = NULL;
-
-       if (free_blocks > 0) {
-               bitmap_bh = read_block_bitmap(sb, group_no);
-               if (!bitmap_bh)
-                       goto io_error;
-               grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
-                               group_no, bitmap_bh, grp_target_blk,
-                               my_rsv, &num, &fatal);
-               if (fatal)
-                       goto out;
-               if (grp_alloc_blk >= 0)
-                       goto allocated;
-       }
-
-       ngroups = EXT4_SB(sb)->s_groups_count;
-       smp_rmb();
-
-       /*
-        * Now search the rest of the groups.  We assume that
-        * group_no and gdp correctly point to the last group visited.
-        */
-       for (bgi = 0; bgi < ngroups; bgi++) {
-               group_no++;
-               if (group_no >= ngroups)
-                       group_no = 0;
-               gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
-               if (!gdp)
-                       goto io_error;
-               free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
-               /*
-                * skip this group if the number of
-                * free blocks is less than half of the reservation
-                * window size.
-                */
-               if (free_blocks <= (windowsz/2))
-                       continue;
-
-               brelse(bitmap_bh);
-               bitmap_bh = read_block_bitmap(sb, group_no);
-               if (!bitmap_bh)
-                       goto io_error;
-               /*
-                * try to allocate block(s) from this group, without a goal(-1).
-                */
-               grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
-                                       group_no, bitmap_bh, -1, my_rsv,
-                                       &num, &fatal);
-               if (fatal)
-                       goto out;
-               if (grp_alloc_blk >= 0)
-                       goto allocated;
-       }
-       /*
-        * We may end up with a bogus earlier ENOSPC error because the
-        * filesystem is "full" of reservations, while there may indeed
-        * be free blocks available on disk.  In this case, we just forget
-        * about the reservations and do the block allocation as if
-        * without reservations.
- */ - if (my_rsv) { - my_rsv = NULL; - windowsz = 0; - group_no = goal_group; - goto retry_alloc; - } - /* No space left on the device */ - *errp = -ENOSPC; - goto out; - -allocated: - - ext4_debug("using block group %lu(%d)\n", - group_no, gdp->bg_free_blocks_count); - - BUFFER_TRACE(gdp_bh, "get_write_access"); - fatal = ext4_journal_get_write_access(handle, gdp_bh); - if (fatal) - goto out; - - ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no); - - if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) || - in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) || - in_range(ret_block, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group) || - in_range(ret_block + num - 1, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group)) { - ext4_error(sb, "ext4_new_block", - "Allocating block in system zone - " - "blocks from %llu, length %lu", - ret_block, num); - goto out; - } - - performed_allocation = 1; - -#ifdef CONFIG_JBD2_DEBUG - { - struct buffer_head *debug_bh; - - /* Record bitmap buffer state in the newly allocated block */ - debug_bh = sb_find_get_block(sb, ret_block); - if (debug_bh) { - BUFFER_TRACE(debug_bh, "state when allocated"); - BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state"); - brelse(debug_bh); - } - } - jbd_lock_bh_state(bitmap_bh); - spin_lock(sb_bgl_lock(sbi, group_no)); - if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) { - int i; - - for (i = 0; i < num; i++) { - if (ext4_test_bit(grp_alloc_blk+i, - bh2jh(bitmap_bh)->b_committed_data)) { - printk("%s: block was unexpectedly set in " - "b_committed_data\n", __FUNCTION__); - } - } - } - ext4_debug("found bit %d\n", grp_alloc_blk); - spin_unlock(sb_bgl_lock(sbi, group_no)); - jbd_unlock_bh_state(bitmap_bh); -#endif - - if (ret_block + num - 1 >= ext4_blocks_count(es)) { - ext4_error(sb, "ext4_new_block", - "block(%llu) >= blocks count(%llu) - " - "block_group = %lu, es == %p ", ret_block, - ext4_blocks_count(es), group_no, es); - goto out; - } - - /* - * It is up to the caller to add the new buffer to a journal - * list of some description. We don't know in advance whether - * the caller wants to use it as metadata or data. 
- */
-       spin_lock(sb_bgl_lock(sbi, group_no));
-       if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
-               gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
-       gdp->bg_free_blocks_count =
-                       cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num);
-       gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
-       spin_unlock(sb_bgl_lock(sbi, group_no));
-       percpu_counter_sub(&sbi->s_freeblocks_counter, num);
-
-       BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
-       err = ext4_journal_dirty_metadata(handle, gdp_bh);
-       if (!fatal)
-               fatal = err;
-
-       sb->s_dirt = 1;
-       if (fatal)
-               goto out;
-
-       *errp = 0;
-       brelse(bitmap_bh);
-       DQUOT_FREE_BLOCK(inode, *count-num);
-       *count = num;
-       return ret_block;
-
-io_error:
-       *errp = -EIO;
-out:
-       if (fatal) {
-               *errp = fatal;
-               ext4_std_error(sb, fatal);
-       }
-       /*
-        * Undo the block allocation
-        */
-       if (!performed_allocation)
-               DQUOT_FREE_BLOCK(inode, *count);
-       brelse(bitmap_bh);
-       return 0;
-}
-
-ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
-                       ext4_fsblk_t goal, int *errp)
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
+                                 ext4_fsblk_t goal, unsigned int flags,
+                                 unsigned long *count, int *errp)
 {
        struct ext4_allocation_request ar;
        ext4_fsblk_t ret;
 
-       if (!test_opt(inode->i_sb, MBALLOC)) {
-               unsigned long count = 1;
-               ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
-               return ret;
-       }
-       memset(&ar, 0, sizeof(ar));
+       /* Fill with neighbour allocated blocks */
        ar.inode = inode;
        ar.goal = goal;
-       ar.len = 1;
-       ret = ext4_mb_new_blocks(handle, &ar, errp);
-       return ret;
-}
-
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
-                       ext4_fsblk_t goal, unsigned long *count, int *errp)
-{
-       struct ext4_allocation_request ar;
-       ext4_fsblk_t ret;
+       ar.len = count ? *count : 1;
+       ar.flags = flags;
 
-       if (!test_opt(inode->i_sb, MBALLOC)) {
-               ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
-               return ret;
-       }
-
-       memset(&ar, 0, sizeof(ar));
-       ar.inode = inode;
-       ar.goal = goal;
-       ar.len = *count;
        ret = ext4_mb_new_blocks(handle, &ar, errp);
-       *count = ar.len;
+       if (count)
+               *count = ar.len;
+       /*
+        * Account for the allocated meta blocks.  We will never
+        * fail EDQUOT for metadata, but we do account for it.
+        */
+       if (!(*errp) &&
+           ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
+               spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+               EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
+               spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+               dquot_alloc_block_nofail(inode,
+                       EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
+       }
        return ret;
 }
-
 /**
- * ext4_count_free_blocks() -- count filesystem free blocks
+ * ext4_count_free_clusters() -- count filesystem free clusters
 * @sb: superblock
 *
- * Adds up the number of free blocks from each block group.
+ * Adds up the number of free clusters from each block group.
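(Counting free clusters in a bitmap reduces to "total bits minus set bits", which is what the memweight()-based ext4_count_free() in bitmap.c computes. A user-space equivalent, assuming GCC/Clang's __builtin_popcount() is available:)

#include <stdio.h>

static unsigned int count_free(const unsigned char *bitmap,
                               unsigned int numchars)
{
        unsigned int set = 0, i;

        for (i = 0; i < numchars; i++)
                set += __builtin_popcount(bitmap[i]);
        return numchars * 8 - set;      /* clear bits == free clusters */
}

int main(void)
{
        unsigned char bitmap[4] = { 0xff, 0x0f, 0x00, 0x80 };

        /* 8 + 4 + 0 + 1 = 13 bits set out of 32, so 19 are free. */
        printf("free: %u\n", count_free(bitmap, sizeof(bitmap)));
        return 0;
}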
*/ -ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) +ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) { ext4_fsblk_t desc_count; struct ext4_group_desc *gdp; ext4_group_t i; - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_group_info *grp; #ifdef EXT4FS_DEBUG struct ext4_super_block *es; ext4_fsblk_t bitmap_count; - unsigned long x; + unsigned int x; struct buffer_head *bitmap_bh = NULL; es = EXT4_SB(sb)->s_es; @@ -1924,36 +671,43 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) bitmap_count = 0; gdp = NULL; - smp_rmb(); for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + grp = NULL; + if (EXT4_SB(sb)->s_group_info) + grp = ext4_get_group_info(sb, i); + if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + desc_count += ext4_free_group_clusters(sb, gdp); brelse(bitmap_bh); - bitmap_bh = read_block_bitmap(sb, i); + bitmap_bh = ext4_read_block_bitmap(sb, i); if (bitmap_bh == NULL) continue; - x = ext4_count_free(bitmap_bh, sb->s_blocksize); - printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", - i, le16_to_cpu(gdp->bg_free_blocks_count), x); + x = ext4_count_free(bitmap_bh->b_data, + EXT4_CLUSTERS_PER_GROUP(sb) / 8); + printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", + i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); - printk("ext4_count_free_blocks: stored = %llu" - ", computed = %llu, %llu\n", - ext4_free_blocks_count(es), - desc_count, bitmap_count); + printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" + ", computed = %llu, %llu\n", + EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), + desc_count, bitmap_count); return bitmap_count; #else desc_count = 0; - smp_rmb(); for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + grp = NULL; + if (EXT4_SB(sb)->s_group_info) + grp = ext4_get_group_info(sb, i); + if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + desc_count += ext4_free_group_clusters(sb, gdp); } return desc_count; @@ -1962,21 +716,15 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) static inline int test_root(ext4_group_t a, int b) { - int num = b; - - while (a > num) - num *= b; - return num == a; -} - -static int ext4_group_sparse(ext4_group_t group) -{ - if (group <= 1) - return 1; - if (!(group & 1)) - return 0; - return (test_root(group, 7) || test_root(group, 5) || - test_root(group, 3)); + while (1) { + if (a < b) + return 0; + if (a == b) + return 1; + if ((a % b) != 0) + return 0; + a = a / b; + } } /** @@ -1989,11 +737,26 @@ static int ext4_group_sparse(ext4_group_t group) */ int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) { - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) && - !ext4_group_sparse(group)) + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + if (group == 0) + return 1; + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_SPARSE_SUPER2)) { + if (group == le32_to_cpu(es->s_backup_bgs[0]) || + group == le32_to_cpu(es->s_backup_bgs[1])) + return 1; + return 0; + } + if ((group <= 1) || !EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) + return 1; + if (!(group & 1)) return 0; - return 1; + if (test_root(group, 3) || (test_root(group, 5)) || + test_root(group, 7)) + return 1; + + return 0; } static unsigned long 
ext4_bg_num_gdb_meta(struct super_block *sb,
@@ -2011,7 +774,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
 static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
                                        ext4_group_t group)
 {
-       return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0;
+       if (!ext4_bg_has_super(sb, group))
+               return 0;
+
+       if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
+               return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+       else
+               return EXT4_SB(sb)->s_gdb_count;
 }
 
 /**
@@ -2031,8 +800,82 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
 
        if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
            metagroup < first_meta_bg)
-               return ext4_bg_num_gdb_nometa(sb,group);
+               return ext4_bg_num_gdb_nometa(sb, group);
 
        return ext4_bg_num_gdb_meta(sb,group);
 
 }
 
+/*
+ * This function returns the number of file system metadata clusters at
+ * the beginning of a block group, including the reserved gdt blocks.
+ */
+static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
+                                    ext4_group_t block_group)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       unsigned num;
+
+       /* Check for superblock and gdt backups in this group */
+       num = ext4_bg_has_super(sb, block_group);
+
+       if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
+           block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
+                         sbi->s_desc_per_block) {
+               if (num) {
+                       num += ext4_bg_num_gdb(sb, block_group);
+                       num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+               }
+       } else { /* For META_BG_BLOCK_GROUPS */
+               num += ext4_bg_num_gdb(sb, block_group);
+       }
+       return EXT4_NUM_B2C(sbi, num);
+}
+/**
+ * ext4_inode_to_goal_block - return a hint for block allocation
+ * @inode: inode for block allocation
+ *
+ * Return the ideal location to start allocating blocks for a
+ * newly created inode.
+ */
+ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
+{
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       ext4_group_t block_group;
+       ext4_grpblk_t colour;
+       int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
+       ext4_fsblk_t bg_start;
+       ext4_fsblk_t last_block;
+
+       block_group = ei->i_block_group;
+       if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+               /*
+                * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
+                * block groups per flexgroup, reserve the first block
+                * group for directories and special files.  Regular
+                * files will start at the second block group.  This
+                * tends to speed up directory access and improves
+                * fsck times.
+                */
+               block_group &= ~(flex_size-1);
+               if (S_ISREG(inode->i_mode))
+                       block_group++;
+       }
+       bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
+       last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+
+       /*
+        * If we are doing delayed allocation, we don't need to take
+        * colour into account.
+ */ + if (test_opt(inode->i_sb, DELALLOC)) + return bg_start; + + if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) + colour = (current->pid % 16) * + (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); + else + colour = (current->pid % 16) * ((last_block - bg_start) / 16); + return bg_start + colour; +} + diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 420554f8f79..3285aa5a706 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -9,24 +9,94 @@ #include <linux/buffer_head.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> +#include "ext4.h" -#ifdef EXT4FS_DEBUG +unsigned int ext4_count_free(char *bitmap, unsigned int numchars) +{ + return numchars * BITS_PER_BYTE - memweight(bitmap, numchars); +} + +int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz) +{ + __u32 hi; + __u32 provided, calculated; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; -static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; + provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); + calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { + hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi); + provided |= (hi << 16); + } else + calculated &= 0xFFFF; -unsigned long ext4_count_free (struct buffer_head * map, unsigned int numchars) + return provided == calculated; +} + +void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz) { - unsigned int i; - unsigned long sum = 0; - - if (!map) - return (0); - for (i = 0; i < numchars; i++) - sum += nibblemap[map->b_data[i] & 0xf] + - nibblemap[(map->b_data[i] >> 4) & 0xf]; - return (sum); + __u32 csum; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); + if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) + gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); } -#endif /* EXT4FS_DEBUG */ +int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh) +{ + __u32 hi; + __u32 provided, calculated; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); + calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) { + hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi); + provided |= (hi << 16); + } else + calculated &= 0xFFFF; + + if (provided == calculated) + return 1; + + return 0; +} + +void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh) +{ + int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; + __u32 csum; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); + if (sbi->s_desc_size >= 
EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) + gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16); +} diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c new file mode 100644 index 00000000000..41eb9dcfac7 --- /dev/null +++ b/fs/ext4/block_validity.c @@ -0,0 +1,243 @@ +/* + * linux/fs/ext4/block_validity.c + * + * Copyright (C) 2009 + * Theodore Ts'o (tytso@mit.edu) + * + * Track which blocks in the filesystem are metadata blocks that + * should never be used as data blocks by files or directories. + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> +#include <linux/swap.h> +#include <linux/pagemap.h> +#include <linux/blkdev.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include "ext4.h" + +struct ext4_system_zone { + struct rb_node node; + ext4_fsblk_t start_blk; + unsigned int count; +}; + +static struct kmem_cache *ext4_system_zone_cachep; + +int __init ext4_init_system_zone(void) +{ + ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, 0); + if (ext4_system_zone_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_system_zone(void) +{ + kmem_cache_destroy(ext4_system_zone_cachep); +} + +static inline int can_merge(struct ext4_system_zone *entry1, + struct ext4_system_zone *entry2) +{ + if ((entry1->start_blk + entry1->count) == entry2->start_blk) + return 1; + return 0; +} + +/* + * Mark a range of blocks as belonging to the "system zone" --- that + * is, filesystem metadata blocks which should never be used by + * inodes. + */ +static int add_system_zone(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count) +{ + struct ext4_system_zone *new_entry = NULL, *entry; + struct rb_node **n = &sbi->system_blks.rb_node, *node; + struct rb_node *parent = NULL, *new_node = NULL; + + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_system_zone, node); + if (start_blk < entry->start_blk) + n = &(*n)->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = &(*n)->rb_right; + else { + if (start_blk + count > (entry->start_blk + + entry->count)) + entry->count = (start_blk + count - + entry->start_blk); + new_node = *n; + new_entry = rb_entry(new_node, struct ext4_system_zone, + node); + break; + } + } + + if (!new_entry) { + new_entry = kmem_cache_alloc(ext4_system_zone_cachep, + GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + new_entry->start_blk = start_blk; + new_entry->count = count; + new_node = &new_entry->node; + + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &sbi->system_blks); + } + + /* Can we merge to the left? */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + if (can_merge(entry, new_entry)) { + new_entry->start_blk = entry->start_blk; + new_entry->count += entry->count; + rb_erase(node, &sbi->system_blks); + kmem_cache_free(ext4_system_zone_cachep, entry); + } + } + + /* Can we merge to the right? 
*/ + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + if (can_merge(new_entry, entry)) { + new_entry->count += entry->count; + rb_erase(node, &sbi->system_blks); + kmem_cache_free(ext4_system_zone_cachep, entry); + } + } + return 0; +} + +static void debug_print_tree(struct ext4_sb_info *sbi) +{ + struct rb_node *node; + struct ext4_system_zone *entry; + int first = 1; + + printk(KERN_INFO "System zones: "); + node = rb_first(&sbi->system_blks); + while (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + printk("%s%llu-%llu", first ? "" : ", ", + entry->start_blk, entry->start_blk + entry->count - 1); + first = 0; + node = rb_next(node); + } + printk("\n"); +} + +int ext4_setup_system_zone(struct super_block *sb) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp; + ext4_group_t i; + int flex_size = ext4_flex_bg_size(sbi); + int ret; + + if (!test_opt(sb, BLOCK_VALIDITY)) { + if (EXT4_SB(sb)->system_blks.rb_node) + ext4_release_system_zone(sb); + return 0; + } + if (EXT4_SB(sb)->system_blks.rb_node) + return 0; + + for (i=0; i < ngroups; i++) { + if (ext4_bg_has_super(sb, i) && + ((i < 5) || ((i % flex_size) == 0))) + add_system_zone(sbi, ext4_group_first_block_no(sb, i), + ext4_bg_num_gdb(sb, i) + 1); + gdp = ext4_get_group_desc(sb, i, NULL); + ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); + if (ret) + return ret; + ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1); + if (ret) + return ret; + ret = add_system_zone(sbi, ext4_inode_table(sb, gdp), + sbi->s_itb_per_group); + if (ret) + return ret; + } + + if (test_opt(sb, DEBUG)) + debug_print_tree(EXT4_SB(sb)); + return 0; +} + +/* Called when the filesystem is unmounted */ +void ext4_release_system_zone(struct super_block *sb) +{ + struct ext4_system_zone *entry, *n; + + rbtree_postorder_for_each_entry_safe(entry, n, + &EXT4_SB(sb)->system_blks, node) + kmem_cache_free(ext4_system_zone_cachep, entry); + + EXT4_SB(sb)->system_blks = RB_ROOT; +} + +/* + * Returns 1 if the passed-in block region (start_blk, + * start_blk+count) is valid; 0 if some part of the block region + * overlaps with filesystem metadata blocks. 
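(The test that follows is an interval-overlap lookup against that tree. A user-space sketch with a sorted array and binary search standing in for the red-black tree; zone contents are made up.)

#include <stdio.h>

struct zone { unsigned long long start; unsigned int count; };

/* Valid only if [start, start+count) touches no zone.  Zones are
 * non-overlapping and sorted by start block. */
static int region_valid(const struct zone *z, int nr,
                        unsigned long long start, unsigned int count)
{
        int lo = 0, hi = nr - 1;

        while (lo <= hi) {
                int mid = (lo + hi) / 2;

                if (start + count - 1 < z[mid].start)
                        hi = mid - 1;           /* entirely below this zone */
                else if (start >= z[mid].start + z[mid].count)
                        lo = mid + 1;           /* entirely above this zone */
                else
                        return 0;               /* overlap: invalid */
        }
        return 1;
}

int main(void)
{
        struct zone z[] = { {0, 64}, {1024, 16} };      /* e.g. sb/gdt, bitmaps */

        printf("%d\n", region_valid(z, 2, 100, 10));    /* 1: clear */
        printf("%d\n", region_valid(z, 2, 1030, 4));    /* 0: hits a zone */
        return 0;
}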
+ */ +int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, + unsigned int count) +{ + struct ext4_system_zone *entry; + struct rb_node *n = sbi->system_blks.rb_node; + + if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count < start_blk) || + (start_blk + count > ext4_blocks_count(sbi->s_es))) { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + return 0; + } + while (n) { + entry = rb_entry(n, struct ext4_system_zone, node); + if (start_blk + count - 1 < entry->start_blk) + n = n->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = n->rb_right; + else { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + return 0; + } + } + return 1; +} + +int ext4_check_blockref(const char *function, unsigned int line, + struct inode *inode, __le32 *p, unsigned int max) +{ + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + __le32 *bref = p; + unsigned int blk; + + while (bref < p+max) { + blk = le32_to_cpu(*bref++); + if (blk && + unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), + blk, 1))) { + es->s_last_error_block = cpu_to_le64(blk); + ext4_error_inode(inode, function, line, blk, + "invalid block"); + return -EIO; + } + } + return 0; +} + diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 2c23bade9aa..ef1bed66c14 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -23,123 +23,135 @@ #include <linux/fs.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> #include <linux/buffer_head.h> #include <linux/slab.h> #include <linux/rbtree.h> +#include "ext4.h" +#include "xattr.h" -static unsigned char ext4_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -}; - -static int ext4_readdir(struct file *, void *, filldir_t); -static int ext4_dx_readdir(struct file * filp, - void * dirent, filldir_t filldir); -static int ext4_release_dir (struct inode * inode, - struct file * filp); - -const struct file_operations ext4_dir_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, - .readdir = ext4_readdir, /* we take BKL. needed?*/ - .ioctl = ext4_ioctl, /* BKL held */ -#ifdef CONFIG_COMPAT - .compat_ioctl = ext4_compat_ioctl, -#endif - .fsync = ext4_sync_file, - .release = ext4_release_dir, -}; +static int ext4_dx_readdir(struct file *, struct dir_context *); - -static unsigned char get_dtype(struct super_block *sb, int filetype) +/** + * Check if the given dir-inode refers to an htree-indexed directory + * (or a directory which could potentially get converted to use htree + * indexing). + * + * Return 1 if it is a dx dir, 0 if not + */ +static int is_dx_dir(struct inode *inode) { - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || - (filetype >= EXT4_FT_MAX)) - return DT_UNKNOWN; + struct super_block *sb = inode->i_sb; - return (ext4_filetype_table[filetype]); -} + if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX) && + ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || + ((inode->i_size >> sb->s_blocksize_bits) == 1) || + ext4_has_inline_data(inode))) + return 1; + return 0; +} -int ext4_check_dir_entry (const char * function, struct inode * dir, - struct ext4_dir_entry_2 * de, - struct buffer_head * bh, - unsigned long offset) +/* + * Return 0 if the directory entry is OK, and 1 if there is a problem + * + * Note: this is the opposite of what ext2 and ext3 historically returned... + * + * bh passed here can be an inode block or a dir data block, depending + * on the inode inline data flag. 
+ */ +int __ext4_check_dir_entry(const char *function, unsigned int line, + struct inode *dir, struct file *filp, + struct ext4_dir_entry_2 *de, + struct buffer_head *bh, char *buf, int size, + unsigned int offset) { - const char * error_msg = NULL; - const int rlen = ext4_rec_len_from_disk(de->rec_len); + const char *error_msg = NULL; + const int rlen = ext4_rec_len_from_disk(de->rec_len, + dir->i_sb->s_blocksize); - if (rlen < EXT4_DIR_REC_LEN(1)) + if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) error_msg = "rec_len is smaller than minimal"; - else if (rlen % 4 != 0) + else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; - else if (rlen < EXT4_DIR_REC_LEN(de->name_len)) + else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) - error_msg = "directory entry across blocks"; - else if (le32_to_cpu(de->inode) > - le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)) + else if (unlikely(((char *) de - buf) + rlen > size)) + error_msg = "directory entry across range"; + else if (unlikely(le32_to_cpu(de->inode) > + le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; + else + return 0; - if (error_msg != NULL) - ext4_error (dir->i_sb, function, - "bad entry in directory #%lu: %s - " - "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", - dir->i_ino, error_msg, offset, - (unsigned long) le32_to_cpu(de->inode), - rlen, de->name_len); - return error_msg == NULL ? 1 : 0; + if (filp) + ext4_error_file(filp, function, line, bh->b_blocknr, + "bad entry in directory: %s - offset=%u(%u), " + "inode=%u, rec_len=%d, name_len=%d", + error_msg, (unsigned) (offset % size), + offset, le32_to_cpu(de->inode), + rlen, de->name_len); + else + ext4_error_inode(dir, function, line, bh->b_blocknr, + "bad entry in directory: %s - offset=%u(%u), " + "inode=%u, rec_len=%d, name_len=%d", + error_msg, (unsigned) (offset % size), + offset, le32_to_cpu(de->inode), + rlen, de->name_len); + + return 1; } -static int ext4_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int ext4_readdir(struct file *file, struct dir_context *ctx) { - int error = 0; - unsigned long offset; - int i, stored; + unsigned int offset; + int i; struct ext4_dir_entry_2 *de; - struct super_block *sb; int err; - struct inode *inode = filp->f_path.dentry->d_inode; - int ret = 0; - - sb = inode->i_sb; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + int dir_has_error = 0; - if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_COMPAT_DIR_INDEX) && - ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) || - ((inode->i_size >> sb->s_blocksize_bits) == 1))) { - err = ext4_dx_readdir(filp, dirent, filldir); + if (is_dx_dir(inode)) { + err = ext4_dx_readdir(file, ctx); if (err != ERR_BAD_DX_DIR) { - ret = err; - goto out; + return err; } /* * We don't set the inode dirty flag since it's not * critical that it get flushed back to the disk. 
*/ - EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL; + ext4_clear_inode_flag(file_inode(file), + EXT4_INODE_INDEX); } - stored = 0; - offset = filp->f_pos & (sb->s_blocksize - 1); - while (!error && !stored && filp->f_pos < inode->i_size) { - ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); - struct buffer_head map_bh; + if (ext4_has_inline_data(inode)) { + int has_inline_data = 1; + int ret = ext4_read_inline_dir(file, ctx, + &has_inline_data); + if (has_inline_data) + return ret; + } + + offset = ctx->pos & (sb->s_blocksize - 1); + + while (ctx->pos < inode->i_size) { + struct ext4_map_blocks map; struct buffer_head *bh = NULL; - map_bh.b_state = 0; - err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0); + map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); + map.m_len = 1; + err = ext4_map_blocks(NULL, inode, &map, 0); if (err > 0) { - pgoff_t index = map_bh.b_blocknr >> + pgoff_t index = map.m_pblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - if (!ra_has_index(&filp->f_ra, index)) + if (!ra_has_index(&file->f_ra, index)) page_cache_sync_readahead( sb->s_bdev->bd_inode->i_mapping, - &filp->f_ra, filp, + &file->f_ra, file, index, 1); - filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; - bh = ext4_bread(NULL, inode, blk, 0, &err); + file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; + bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); } /* @@ -147,22 +159,38 @@ static int ext4_readdir(struct file * filp, * of recovering data when there's a bad sector */ if (!bh) { - ext4_error (sb, "ext4_readdir", - "directory #%lu contains a hole at offset %lu", - inode->i_ino, (unsigned long)filp->f_pos); + if (!dir_has_error) { + EXT4_ERROR_FILE(file, 0, + "directory contains a " + "hole at offset %llu", + (unsigned long long) ctx->pos); + dir_has_error = 1; + } /* corrupt size? Maybe no more blocks to read */ - if (filp->f_pos > inode->i_blocks << 9) + if (ctx->pos > inode->i_blocks << 9) break; - filp->f_pos += sb->s_blocksize - offset; + ctx->pos += sb->s_blocksize - offset; continue; } -revalidate: + /* Check the checksum */ + if (!buffer_verified(bh) && + !ext4_dirent_csum_verify(inode, + (struct ext4_dir_entry *)bh->b_data)) { + EXT4_ERROR_FILE(file, 0, "directory fails checksum " + "at offset %llu", + (unsigned long long)ctx->pos); + ctx->pos += sb->s_blocksize - offset; + brelse(bh); + continue; + } + set_buffer_verified(bh); + /* If the dir block has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ - if (filp->f_version != inode->i_version) { + if (file->f_version != inode->i_version) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext4_dir_entry_2 *) (bh->b_data + i); @@ -172,78 +200,136 @@ revalidate: * least that it is non-zero. A * failure will be detected in the * dirent test below. 
*/ - if (ext4_rec_len_from_disk(de->rec_len) - < EXT4_DIR_REC_LEN(1)) + if (ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) break; - i += ext4_rec_len_from_disk(de->rec_len); + i += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); } offset = i; - filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; - filp->f_version = inode->i_version; + file->f_version = inode->i_version; } - while (!error && filp->f_pos < inode->i_size + while (ctx->pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); - if (!ext4_check_dir_entry ("ext4_readdir", inode, de, - bh, offset)) { + if (ext4_check_dir_entry(inode, file, de, bh, + bh->b_data, bh->b_size, + offset)) { /* - * On error, skip the f_pos to the next block + * On error, skip to the next block */ - filp->f_pos = (filp->f_pos | + ctx->pos = (ctx->pos | (sb->s_blocksize - 1)) + 1; - brelse (bh); - ret = stored; - goto out; + break; } - offset += ext4_rec_len_from_disk(de->rec_len); + offset += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); if (le32_to_cpu(de->inode)) { - /* We might block in the next section - * if the data destination is - * currently swapped out. So, use a - * version stamp to detect whether or - * not the directory has been modified - * during the copy operation. - */ - u64 version = filp->f_version; - - error = filldir(dirent, de->name, + if (!dir_emit(ctx, de->name, de->name_len, - filp->f_pos, le32_to_cpu(de->inode), - get_dtype(sb, de->file_type)); - if (error) - break; - if (version != filp->f_version) - goto revalidate; - stored ++; + get_dtype(sb, de->file_type))) { + brelse(bh); + return 0; + } } - filp->f_pos += ext4_rec_len_from_disk(de->rec_len); + ctx->pos += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); } offset = 0; - brelse (bh); + brelse(bh); + if (ctx->pos < inode->i_size) { + if (!dir_relax(inode)) + return 0; + } } -out: - return ret; + return 0; +} + +static inline int is_32bit_api(void) +{ +#ifdef CONFIG_COMPAT + return is_compat_task(); +#else + return (BITS_PER_LONG == 32); +#endif } /* * These functions convert from the major/minor hash to an f_pos - * value. + * value for dx directories * - * Currently we only use major hash numer. This is unfortunate, but - * on 32-bit machines, the same VFS interface is used for lseek and - * llseek, so if we use the 64 bit offset, then the 32-bit versions of - * lseek/telldir/seekdir will blow out spectacularly, and from within - * the ext2 low-level routine, we don't know if we're being called by - * a 64-bit version of the system call or the 32-bit version of the - * system call. Worse yet, NFSv2 only allows for a 32-bit readdir - * cookie. Sigh. + * Upper layer (for example NFS) should specify FMODE_32BITHASH or + * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted + * directly on both 32-bit and 64-bit nodes, under such case, neither + * FMODE_32BITHASH nor FMODE_64BITHASH is specified. 
*/ -#define hash2pos(major, minor) (major >> 1) -#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) -#define pos2min_hash(pos) (0) +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return major >> 1; + else + return ((__u64)(major >> 1) << 32) | (__u64)minor; +} + +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return (pos << 1) & 0xffffffff; + else + return ((pos >> 32) << 1) & 0xffffffff; +} + +static inline __u32 pos2min_hash(struct file *filp, loff_t pos) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return 0; + else + return pos & 0xffffffff; +} + +/* + * Return 32- or 64-bit end-of-file for dx directories + */ +static inline loff_t ext4_get_htree_eof(struct file *filp) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return EXT4_HTREE_EOF_32BIT; + else + return EXT4_HTREE_EOF_64BIT; +} + + +/* + * ext4_dir_llseek() calls generic_file_llseek_size to handle htree + * directories, where the "offset" is in terms of the filename hash + * value instead of the byte offset. + * + * Because we may return a 64-bit hash that is well beyond offset limits, + * we need to pass the max hash as the maximum allowable offset in + * the htree directory case. + * + * For non-htree, ext4_llseek already chooses the proper max offset. + */ +static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + int dx_dir = is_dx_dir(inode); + loff_t htree_max = ext4_get_htree_eof(file); + + if (likely(dx_dir)) + return generic_file_llseek_size(file, offset, whence, + htree_max, htree_max); + else + return ext4_llseek(file, offset, whence); +} /* * This structure holds the nodes of the red-black tree used to store @@ -266,59 +352,29 @@ struct fname { */ static void free_rb_tree_fname(struct rb_root *root) { - struct rb_node *n = root->rb_node; - struct rb_node *parent; - struct fname *fname; - - while (n) { - /* Do the node's children first */ - if ((n)->rb_left) { - n = n->rb_left; - continue; - } - if (n->rb_right) { - n = n->rb_right; - continue; - } - /* - * The node has no children; free it, and then zero - * out parent's link to it. Finally go to the - * beginning of the loop and try to free the parent - * node. 
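To make the cookie encoding above concrete: in 64-bit mode hash2pos() packs the top 31 bits of the major hash into the upper word of f_pos and the minor hash into the lower word, and the pos2*_hash() helpers invert that. A stand-alone sketch of the 64-bit case; hash2pos64() and friends are invented names:

#include <assert.h>
#include <stdint.h>

static uint64_t hash2pos64(uint32_t major, uint32_t minor)
{
	return ((uint64_t)(major >> 1) << 32) | minor;
}

static uint32_t pos2maj64(uint64_t pos) { return ((pos >> 32) << 1) & 0xffffffff; }
static uint32_t pos2min64(uint64_t pos) { return pos & 0xffffffff; }

int main(void)
{
	uint32_t major = 0x9abcdef1, minor = 0x12345678;
	uint64_t pos = hash2pos64(major, minor);

	/* Bit 0 of the major hash is not representable in a cookie, so
	 * the round trip recovers major with its low bit cleared. */
	assert(pos2maj64(pos) == (major & ~1u));
	assert(pos2min64(pos) == minor);
	return 0;
}

Losing bit 0 is harmless because a cookie only needs to name a position between entries, not reproduce an exact hash value.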
- */ - parent = rb_parent(n); - fname = rb_entry(n, struct fname, rb_hash); + struct fname *fname, *next; + + rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash) while (fname) { - struct fname * old = fname; + struct fname *old = fname; fname = fname->next; - kfree (old); + kfree(old); } - if (!parent) - root->rb_node = NULL; - else if (parent->rb_left == n) - parent->rb_left = NULL; - else if (parent->rb_right == n) - parent->rb_right = NULL; - n = parent; - } - root->rb_node = NULL; + + *root = RB_ROOT; } -static struct dir_private_info *create_dir_info(loff_t pos) +static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, + loff_t pos) { struct dir_private_info *p; - p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); + p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); if (!p) return NULL; - p->root.rb_node = NULL; - p->curr_node = NULL; - p->extra_fname = NULL; - p->last_pos = 0; - p->curr_hash = pos2maj_hash(pos); - p->curr_minor_hash = pos2min_hash(pos); - p->next_hash = 0; + p->curr_hash = pos2maj_hash(filp, pos); + p->curr_minor_hash = pos2min_hash(filp, pos); return p; } @@ -336,11 +392,11 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, struct ext4_dir_entry_2 *dirent) { struct rb_node **p, *parent = NULL; - struct fname * fname, *new_fn; + struct fname *fname, *new_fn; struct dir_private_info *info; int len; - info = (struct dir_private_info *) dir_file->private_data; + info = dir_file->private_data; p = &info->root.rb_node; /* Create and allocate the fname structure */ @@ -393,73 +449,69 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, * for all entries on the fname linked list. (Normally there is only * one entry on the linked list, unless there are 62 bit hash collisions.)
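The structure these routines manage is worth spelling out: one tree node per distinct hash value, with names that collide on the full 62-bit hash chained through fname->next. A toy model of the teardown; the kernel version above is iterative via rbtree_postorder_for_each_entry_safe(), while this sketch uses plain recursion and invented names to stay self-contained:

#include <stdlib.h>

/* Simplified analogue of struct fname: a search tree keyed by hash,
 * where names sharing the full hash chain through 'next'. */
struct fname_sketch {
	struct fname_sketch *left, *right; /* tree links (rb_hash in ext4) */
	struct fname_sketch *next;         /* hash-collision chain */
	unsigned int hash;
	char name[32];
};

/* Post-order teardown: free both subtrees, then the node itself, and
 * let every node drag its collision chain along with it. */
static void free_fname_tree(struct fname_sketch *n)
{
	if (!n)
		return;
	free_fname_tree(n->left);
	free_fname_tree(n->right);
	while (n) {
		struct fname_sketch *old = n;

		n = n->next;
		free(old);
	}
}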
*/ -static int call_filldir(struct file * filp, void * dirent, - filldir_t filldir, struct fname *fname) +static int call_filldir(struct file *file, struct dir_context *ctx, + struct fname *fname) { - struct dir_private_info *info = filp->private_data; - loff_t curr_pos; - struct inode *inode = filp->f_path.dentry->d_inode; - struct super_block * sb; - int error; - - sb = inode->i_sb; + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; if (!fname) { - printk("call_filldir: called with null fname?!?\n"); + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " + "called with null fname?!?", __func__, __LINE__, + inode->i_ino, current->comm); return 0; } - curr_pos = hash2pos(fname->hash, fname->minor_hash); + ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); while (fname) { - error = filldir(dirent, fname->name, - fname->name_len, curr_pos, + if (!dir_emit(ctx, fname->name, + fname->name_len, fname->inode, - get_dtype(sb, fname->file_type)); - if (error) { - filp->f_pos = curr_pos; - info->extra_fname = fname->next; - return error; + get_dtype(sb, fname->file_type))) { + info->extra_fname = fname; + return 1; } fname = fname->next; } return 0; } -static int ext4_dx_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) { - struct dir_private_info *info = filp->private_data; - struct inode *inode = filp->f_path.dentry->d_inode; + struct dir_private_info *info = file->private_data; + struct inode *inode = file_inode(file); struct fname *fname; int ret; if (!info) { - info = create_dir_info(filp->f_pos); + info = ext4_htree_create_dir_info(file, ctx->pos); if (!info) return -ENOMEM; - filp->private_data = info; + file->private_data = info; } - if (filp->f_pos == EXT4_HTREE_EOF) + if (ctx->pos == ext4_get_htree_eof(file)) return 0; /* EOF */ /* Someone has messed with f_pos; reset the world */ - if (info->last_pos != filp->f_pos) { + if (info->last_pos != ctx->pos) { free_rb_tree_fname(&info->root); info->curr_node = NULL; info->extra_fname = NULL; - info->curr_hash = pos2maj_hash(filp->f_pos); - info->curr_minor_hash = pos2min_hash(filp->f_pos); + info->curr_hash = pos2maj_hash(file, ctx->pos); + info->curr_minor_hash = pos2min_hash(file, ctx->pos); } /* * If there are any leftover names on the hash collision * chain, return them first. */ - if (info->extra_fname && - call_filldir(filp, dirent, filldir, info->extra_fname)) - goto finished; - - if (!info->curr_node) + if (info->extra_fname) { + if (call_filldir(file, ctx, info->extra_fname)) + goto finished; + info->extra_fname = NULL; + goto next_node; + } else if (!info->curr_node) info->curr_node = rb_first(&info->root); while (1) { @@ -469,17 +521,17 @@ static int ext4_dx_readdir(struct file * filp, * cached entries.
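Note the resume convention in the new call_filldir(): when dir_emit() declines an entry, that entry has not been delivered, so the current fname is stashed in info->extra_fname and replayed on the next ->iterate() call. A toy emitter loop showing the shape of that contract, with invented names:

#include <stdbool.h>
#include <stddef.h>

struct name_node {
	struct name_node *next;
	const char *name;
};

struct iter_state {
	struct name_node *pending; /* plays the role of info->extra_fname */
};

/* 'emit' may refuse at any point (a full getdents buffer, say).  A
 * refused entry was not delivered, so it is saved for replay. */
static int drain_chain(struct iter_state *st, struct name_node *chain,
		       bool (*emit)(const char *name))
{
	for (struct name_node *n = chain; n; n = n->next) {
		if (!emit(n->name)) {
			st->pending = n; /* replay this same entry later */
			return 1;        /* stopped early */
		}
	}
	st->pending = NULL;
	return 0;                        /* chain fully emitted */
}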
*/ if ((!info->curr_node) || - (filp->f_version != inode->i_version)) { + (file->f_version != inode->i_version)) { info->curr_node = NULL; free_rb_tree_fname(&info->root); - filp->f_version = inode->i_version; - ret = ext4_htree_fill_tree(filp, info->curr_hash, + file->f_version = inode->i_version; + ret = ext4_htree_fill_tree(file, info->curr_hash, info->curr_minor_hash, &info->next_hash); if (ret < 0) return ret; if (ret == 0) { - filp->f_pos = EXT4_HTREE_EOF; + ctx->pos = ext4_get_htree_eof(file); break; } info->curr_node = rb_first(&info->root); @@ -488,13 +540,18 @@ static int ext4_dx_readdir(struct file * filp, fname = rb_entry(info->curr_node, struct fname, rb_hash); info->curr_hash = fname->hash; info->curr_minor_hash = fname->minor_hash; - if (call_filldir(filp, dirent, filldir, fname)) + if (call_filldir(file, ctx, fname)) break; - + next_node: info->curr_node = rb_next(info->curr_node); - if (!info->curr_node) { + if (info->curr_node) { + fname = rb_entry(info->curr_node, struct fname, + rb_hash); + info->curr_hash = fname->hash; + info->curr_minor_hash = fname->minor_hash; + } else { if (info->next_hash == ~0) { - filp->f_pos = EXT4_HTREE_EOF; + ctx->pos = ext4_get_htree_eof(file); break; } info->curr_hash = info->next_hash; @@ -502,14 +559,26 @@ static int ext4_dx_readdir(struct file * filp, } } finished: - info->last_pos = filp->f_pos; + info->last_pos = ctx->pos; return 0; } -static int ext4_release_dir (struct inode * inode, struct file * filp) +static int ext4_release_dir(struct inode *inode, struct file *filp) { if (filp->private_data) ext4_htree_free_dir_info(filp->private_data); return 0; } + +const struct file_operations ext4_dir_operations = { + .llseek = ext4_dir_llseek, + .read = generic_read_dir, + .iterate = ext4_readdir, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .fsync = ext4_sync_file, + .release = ext4_release_dir, +}; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h new file mode 100644 index 00000000000..7cc5a0e2368 --- /dev/null +++ b/fs/ext4/ext4.h @@ -0,0 +1,2818 @@ +/* + * ext4.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_H +#define _EXT4_H + +#include <linux/types.h> +#include <linux/blkdev.h> +#include <linux/magic.h> +#include <linux/jbd2.h> +#include <linux/quota.h> +#include <linux/rwsem.h> +#include <linux/rbtree.h> +#include <linux/seqlock.h> +#include <linux/mutex.h> +#include <linux/timer.h> +#include <linux/wait.h> +#include <linux/blockgroup_lock.h> +#include <linux/percpu_counter.h> +#include <linux/ratelimit.h> +#include <crypto/hash.h> +#include <linux/falloc.h> +#ifdef __KERNEL__ +#include <linux/compat.h> +#endif + +/* + * The fourth extended filesystem constants/structures + */ + +/* + * Define EXT4FS_DEBUG to produce debug messages + */ +#undef EXT4FS_DEBUG + +/* + * Debug code + */ +#ifdef EXT4FS_DEBUG +#define ext4_debug(f, a...) \ + do { \ + printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __func__); \ + printk(KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +/* + * Turn on EXT_DEBUG to get lots of info about extents operations. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(fmt, ...) 
printk(fmt, ##__VA_ARGS__) +#else +#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +#define EXT4_ERROR_INODE(inode, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) + +#define EXT4_ERROR_FILE(file, block, fmt, a...) \ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) + +/* data type for block offset of block group */ +typedef int ext4_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long long ext4_fsblk_t; + +/* data type for file logical block number */ +typedef __u32 ext4_lblk_t; + +/* data type for block group number */ +typedef unsigned int ext4_group_t; + +/* + * Flags used in mballoc's allocation_context flags field. + * + * Also used to show what's going on for debugging purposes when the + * flag field is exported via the traceport interface + */ + +/* prefer goal again. length */ +#define EXT4_MB_HINT_MERGE 0x0001 +/* blocks already reserved */ +#define EXT4_MB_HINT_RESERVED 0x0002 +/* metadata is being allocated */ +#define EXT4_MB_HINT_METADATA 0x0004 +/* first blocks in the file */ +#define EXT4_MB_HINT_FIRST 0x0008 +/* search for the best chunk */ +#define EXT4_MB_HINT_BEST 0x0010 +/* data is being allocated */ +#define EXT4_MB_HINT_DATA 0x0020 +/* don't preallocate (for tails) */ +#define EXT4_MB_HINT_NOPREALLOC 0x0040 +/* allocate for locality group */ +#define EXT4_MB_HINT_GROUP_ALLOC 0x0080 +/* allocate goal blocks or none */ +#define EXT4_MB_HINT_GOAL_ONLY 0x0100 +/* goal is meaningful */ +#define EXT4_MB_HINT_TRY_GOAL 0x0200 +/* blocks already pre-reserved by delayed allocation */ +#define EXT4_MB_DELALLOC_RESERVED 0x0400 +/* We are doing stream allocation */ +#define EXT4_MB_STREAM_ALLOC 0x0800 +/* Use reserved root blocks if needed */ +#define EXT4_MB_USE_ROOT_BLOCKS 0x1000 +/* Use blocks from reserved pool */ +#define EXT4_MB_USE_RESERVED 0x2000 + +struct ext4_allocation_request { + /* target inode for block we're allocating */ + struct inode *inode; + /* how many blocks we want to allocate */ + unsigned int len; + /* logical block in target inode */ + ext4_lblk_t logical; + /* the closest logical allocated block to the left */ + ext4_lblk_t lleft; + /* the closest logical allocated block to the right */ + ext4_lblk_t lright; + /* phys. target (a hint) */ + ext4_fsblk_t goal; + /* phys. block for the closest logical allocated block to the left */ + ext4_fsblk_t pleft; + /* phys. block for the closest logical allocated block to the right */ + ext4_fsblk_t pright; + /* flags. see above EXT4_MB_HINT_* */ + unsigned int flags; +}; + +/* + * Logical to physical block mapping, used by ext4_map_blocks() + * + * This structure is used to pass requests into ext4_map_blocks() as + * well as to store the information returned by ext4_map_blocks(). It + * takes less room on the stack than a struct buffer_head. + */ +#define EXT4_MAP_NEW (1 << BH_New) +#define EXT4_MAP_MAPPED (1 << BH_Mapped) +#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) +#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) +/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of + * ext4_map_blocks wants to know whether or not the underlying cluster has + * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that + * the requested mapping was from previously mapped (or delayed allocated) + * cluster. We use BH_AllocFromCluster only for this flag. 
BH_AllocFromCluster + * should never appear on buffer_head's state flags. + */ +#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster) +#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ + EXT4_MAP_FROM_CLUSTER) + +struct ext4_map_blocks { + ext4_fsblk_t m_pblk; + ext4_lblk_t m_lblk; + unsigned int m_len; + unsigned int m_flags; +}; + +/* + * Flags for ext4_io_end->flags + */ +#define EXT4_IO_END_UNWRITTEN 0x0001 + +/* + * For converting unwritten extents on a work queue. 'handle' is used for + * buffered writeback. + */ +typedef struct ext4_io_end { + struct list_head list; /* per-file finished IO list */ + handle_t *handle; /* handle reserved for extent + * conversion */ + struct inode *inode; /* file being written to */ + struct bio *bio; /* Linked list of completed + * bios covering the extent */ + unsigned int flag; /* unwritten or not */ + loff_t offset; /* offset in the file */ + ssize_t size; /* size of the extent */ + atomic_t count; /* reference counter */ +} ext4_io_end_t; + +struct ext4_io_submit { + int io_op; + struct bio *io_bio; + ext4_io_end_t *io_end; + sector_t io_next_block; +}; + +/* + * Special inodes numbers + */ +#define EXT4_BAD_INO 1 /* Bad blocks inode */ +#define EXT4_ROOT_INO 2 /* Root inode */ +#define EXT4_USR_QUOTA_INO 3 /* User quota inode */ +#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ +#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT4_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext4 filesystems */ +#define EXT4_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT4_LINK_MAX 65000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT4_MIN_BLOCK_SIZE 1024 +#define EXT4_MAX_BLOCK_SIZE 65536 +#define EXT4_MIN_BLOCK_LOG_SIZE 10 +#define EXT4_MAX_BLOCK_LOG_SIZE 16 +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) +#else +# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) +#endif +#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) +#define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ + EXT4_SB(s)->s_cluster_bits) +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +# define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) +#else +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +#endif +#ifdef __KERNEL__ +#define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) +#define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) +#define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) +#else +#define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ + EXT4_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +#define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? 
\ + EXT4_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) +#endif +#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) + +/* Translate a block number to a cluster number */ +#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) +/* Translate a cluster number to a block number */ +#define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) +/* Translate # of blks to # of clusters */ +#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ + (sbi)->s_cluster_bits) +/* Mask out the low bits to get the starting block of the cluster */ +#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ + ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ + ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Get the cluster offset */ +#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ + ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ + ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) + +/* + * Structure of a blocks group descriptor + */ +struct ext4_group_desc +{ + __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ + __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ + __le32 bg_inode_table_lo; /* Inodes table block */ + __le16 bg_free_blocks_count_lo;/* Free blocks count */ + __le16 bg_free_inodes_count_lo;/* Free inodes count */ + __le16 bg_used_dirs_count_lo; /* Directories count */ + __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ + __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ + __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ + __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ + __le16 bg_itable_unused_lo; /* Unused inodes count */ + __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ + __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ + __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ + __le32 bg_inode_table_hi; /* Inodes table block MSB */ + __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ + __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ + __le16 bg_used_dirs_count_hi; /* Directories count MSB */ + __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ + __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ + __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ + __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ + __u32 bg_reserved; +}; + +#define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ + sizeof(__le16)) +#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ + (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ + sizeof(__le16)) + +/* + * Structure of a flex block group info + */ + +struct flex_groups { + atomic64_t free_clusters; + atomic_t free_inodes; + atomic_t used_dirs; +}; + +#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ +#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ +#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ + +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT4_MIN_DESC_SIZE 32 +#define EXT4_MIN_DESC_SIZE_64BIT 64 +#define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE +#define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) +#ifdef __KERNEL__ +# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) +# define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) +# define EXT4_INODES_PER_GROUP(s) 
(EXT4_SB(s)->s_inodes_per_group) +# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) +#else +# define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) +# define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) +#endif + +/* + * Constants relative to the data blocks + */ +#define EXT4_NDIR_BLOCKS 12 +#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS +#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) +#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) +#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT4_UNRM_FL 0x00000002 /* Undelete */ +#define EXT4_COMPR_FL 0x00000004 /* Compress file */ +#define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT4_DIRTY_FL 0x00000100 +#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ +#define EXT4_ECOMPR_FL 0x00000800 /* Compression error */ +/* End compression flags --- maybe not all used */ +#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ +#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */ + +/* Flags that should be inherited by new inodes from their parent. */ +#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ + EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ + EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. */ +#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) + +/* Mask out flags that are inappropriate for the given type of inode. 
*/ +static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & EXT4_REG_FLMASK; + else + return flags & EXT4_OTHER_FLMASK; +} + +/* + * Inode flags used for atomic set/get + */ +enum { + EXT4_INODE_SECRM = 0, /* Secure deletion */ + EXT4_INODE_UNRM = 1, /* Undelete */ + EXT4_INODE_COMPR = 2, /* Compress file */ + EXT4_INODE_SYNC = 3, /* Synchronous updates */ + EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ + EXT4_INODE_APPEND = 5, /* writes to file may only append */ + EXT4_INODE_NODUMP = 6, /* do not dump file */ + EXT4_INODE_NOATIME = 7, /* do not update atime */ +/* Reserved for compression usage... */ + EXT4_INODE_DIRTY = 8, + EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ + EXT4_INODE_NOCOMPR = 10, /* Don't compress */ + EXT4_INODE_ECOMPR = 11, /* Compression error */ +/* End compression flags --- maybe not all used */ + EXT4_INODE_INDEX = 12, /* hash-indexed directory */ + EXT4_INODE_IMAGIC = 13, /* AFS directory */ + EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ + EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ + EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ + EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ + EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ + EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ + EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ + EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ + EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ + EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ +}; + +/* + * Since it's pretty easy to mix up bit numbers and hex values, we use a + * build-time check to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost + * any extra space in the compiled kernel image, otherwise, the build will fail. + * It's important that these values are the same, since we are using + * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent + * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk + * values found in ext2, ext3 and ext4 filesystems, and of course the values + * defined in e2fsprogs. + * + * It's not paranoia if the Murphy's Law really *is* out to get you. 
:-) + */ +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) + +static inline void ext4_check_flag_values(void) +{ + CHECK_FLAG_VALUE(SECRM); + CHECK_FLAG_VALUE(UNRM); + CHECK_FLAG_VALUE(COMPR); + CHECK_FLAG_VALUE(SYNC); + CHECK_FLAG_VALUE(IMMUTABLE); + CHECK_FLAG_VALUE(APPEND); + CHECK_FLAG_VALUE(NODUMP); + CHECK_FLAG_VALUE(NOATIME); + CHECK_FLAG_VALUE(DIRTY); + CHECK_FLAG_VALUE(COMPRBLK); + CHECK_FLAG_VALUE(NOCOMPR); + CHECK_FLAG_VALUE(ECOMPR); + CHECK_FLAG_VALUE(INDEX); + CHECK_FLAG_VALUE(IMAGIC); + CHECK_FLAG_VALUE(JOURNAL_DATA); + CHECK_FLAG_VALUE(NOTAIL); + CHECK_FLAG_VALUE(DIRSYNC); + CHECK_FLAG_VALUE(TOPDIR); + CHECK_FLAG_VALUE(HUGE_FILE); + CHECK_FLAG_VALUE(EXTENTS); + CHECK_FLAG_VALUE(EA_INODE); + CHECK_FLAG_VALUE(EOFBLOCKS); + CHECK_FLAG_VALUE(INLINE_DATA); + CHECK_FLAG_VALUE(RESERVED); +} + +/* Used to pass group descriptor data when online resize is done */ +struct ext4_new_group_input { + __u32 group; /* Group number for this data */ + __u64 block_bitmap; /* Absolute block number of block bitmap */ + __u64 inode_bitmap; /* Absolute block number of inode bitmap */ + __u64 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +struct compat_ext4_new_group_input { + u32 group; + compat_u64 block_bitmap; + compat_u64 inode_bitmap; + compat_u64 inode_table; + u32 blocks_count; + u16 reserved_blocks; + u16 unused; +}; +#endif + +/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ +struct ext4_new_group_data { + __u32 group; + __u64 block_bitmap; + __u64 inode_bitmap; + __u64 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 unused; + __u32 free_blocks_count; +}; + +/* Indexes used to index group tables in ext4_new_group_data */ +enum { + BLOCK_BITMAP = 0, /* block bitmap */ + INODE_BITMAP, /* inode bitmap */ + INODE_TABLE, /* inode tables */ + GROUP_TABLE_COUNT, +}; + +/* + * Flags used by ext4_map_blocks() + */ + /* Allocate any needed blocks and/or convert an unwritten + extent to be an initialized ext4 */ +#define EXT4_GET_BLOCKS_CREATE 0x0001 + /* Request the creation of an unwritten extent */ +#define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 +#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ + EXT4_GET_BLOCKS_CREATE) + /* Caller is from the delayed allocation writeout path + * finally doing the actual allocation of delayed blocks */ +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 + /* caller is from the direct IO path, request to creation of an + unwritten extents if not allocated, split the unwritten + extent if blocks has been preallocated already*/ +#define EXT4_GET_BLOCKS_PRE_IO 0x0008 +#define EXT4_GET_BLOCKS_CONVERT 0x0010 +#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) + /* Eventual metadata allocation (due to growing extent tree) + * should not fail, so try to use reserved blocks for that.*/ +#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 + /* Don't normalize allocation size (used for fallocate) */ +#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 + /* Request will not result in inode size update 
(user for fallocate) */ +#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 + /* Do not take i_data_sem locking in ext4_map_blocks */ +#define EXT4_GET_BLOCKS_NO_LOCK 0x0100 + /* Do not put hole in extent cache */ +#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 + /* Convert written extents to unwritten */ +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400 + +/* + * The bit position of these flags must not overlap with any of the + * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(), + * read_extent_tree_block(), ext4_split_extent_at(), + * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). + * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be + * caching the extents when reading from the extent tree while a + * truncate or punch hole operation is in progress. + */ +#define EXT4_EX_NOCACHE 0x0400 +#define EXT4_EX_FORCE_CACHE 0x0800 + +/* + * Flags used by ext4_free_blocks + */ +#define EXT4_FREE_BLOCKS_METADATA 0x0001 +#define EXT4_FREE_BLOCKS_FORGET 0x0002 +#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 +#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 +#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 +#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 +#define EXT4_FREE_BLOCKS_RESERVE 0x0040 + +/* + * ioctl commands + */ +#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS +#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS +#define EXT4_IOC_GETVERSION _IOR('f', 3, long) +#define EXT4_IOC_SETVERSION _IOW('f', 4, long) +#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION +#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION +#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) +#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) +#define EXT4_IOC_MIGRATE _IO('f', 9) + /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ + /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ +#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) +#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) +#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) +#define EXT4_IOC_SWAP_BOOT _IO('f', 17) +#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) + +#if defined(__KERNEL__) && defined(CONFIG_COMPAT) +/* + * ioctl commands in 32 bit emulation + */ +#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define EXT4_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT4_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) +#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION +#endif + +/* Max physical block we can address w/o extents */ +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF + +/* + * Structure of an inode on the disk + */ +struct ext4_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size_lo; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Inode Change time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks_lo; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __le32 l_i_version; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + 
__u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl_lo; /* File ACL */ + __le32 i_size_high; + __le32 i_obso_faddr; /* Obsoleted fragment address */ + union { + struct { + __le16 l_i_blocks_high; /* were l_i_reserved1 */ + __le16 l_i_file_acl_high; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */ + __le16 l_i_reserved; + } linux2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __le16 m_i_file_acl_high; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ + __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ + __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ + __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ + __le32 i_crtime; /* File Creation time */ + __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ + __le32 i_version_hi; /* high 32 bits for 64-bit version */ +}; + +struct move_extent { + __u32 reserved; /* should be zero */ + __u32 donor_fd; /* donor file descriptor */ + __u64 orig_start; /* logical start offset in block for orig */ + __u64 donor_start; /* logical start offset in block for donor */ + __u64 len; /* block length to be moved */ + __u64 moved_len; /* moved block length */ +}; + +#define EXT4_EPOCH_BITS 2 +#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) +#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) + +/* + * Extended fields will fit into an inode if the filesystem was formatted + * with large inodes (-I 256 or larger) and there are not currently any EAs + * consuming all of the available space. For new inodes we always reserve + * enough space for the kernel's known extended fields, but for inodes + * created with an old kernel this might not have been the case. None of + * the extended inode fields is critical for correct filesystem operation. + * This macro checks if a certain field fits in the inode. Note that + * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize + */ +#define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ + ((offsetof(typeof(*ext4_inode), field) + \ + sizeof((ext4_inode)->field)) \ + <= (EXT4_GOOD_OLD_INODE_SIZE + \ + (einode)->i_extra_isize)) \ + +static inline __le32 ext4_encode_extra_time(struct timespec *time) +{ + return cpu_to_le32((sizeof(time->tv_sec) > 4 ? 
+ (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) | + ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK)); +} + +static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) +{ + if (sizeof(time->tv_sec) > 4) + time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) + << 32; + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; +} + +#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ +do { \ + (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(inode)->xtime); \ +} while (0) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(einode)->xtime); \ +} while (0) + +#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ +do { \ + (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ + ext4_decode_extra_time(&(inode)->xtime, \ + raw_inode->xtime ## _extra); \ + else \ + (inode)->xtime.tv_nsec = 0; \ +} while (0) + +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime.tv_sec = \ + (signed)le32_to_cpu((raw_inode)->xtime); \ + else \ + (einode)->xtime.tv_sec = 0; \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + ext4_decode_extra_time(&(einode)->xtime, \ + raw_inode->xtime ## _extra); \ + else \ + (einode)->xtime.tv_nsec = 0; \ +} while (0) + +#define i_disk_version osd1.linux1.l_i_version + +#if defined(__KERNEL__) || defined(__linux__) +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_file_acl_high osd2.linux2.l_i_file_acl_high +#define i_blocks_high osd2.linux2.l_i_blocks_high +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_checksum_lo osd2.linux2.l_i_checksum_lo + +#elif defined(__GNU__) + +#define i_translator osd1.hurd1.h_i_translator +#define i_uid_high osd2.hurd2.h_i_uid_high +#define i_gid_high osd2.hurd2.h_i_gid_high +#define i_author osd2.hurd2.h_i_author + +#elif defined(__masix__) + +#define i_reserved1 osd1.masix1.m_i_reserved1 +#define i_file_acl_high osd2.masix2.m_i_file_acl_high +#define i_reserved2 osd2.masix2.m_i_reserved2 + +#endif /* defined(__KERNEL__) || defined(__linux__) */ + +#include "extents_status.h" + +/* + * fourth extended file system inode data in memory + */ +struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_dtime; + ext4_fsblk_t i_file_acl; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is ued for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + ext4_group_t i_block_group; + ext4_lblk_t i_dir_start_lookup; +#if (BITS_PER_LONG < 64) + unsigned long i_state_flags; /* Dynamic state flags */ +#endif + unsigned long i_flags; + + /* + * Extended attributes can be read independently of the main file + * data. 
Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext4_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext4_get_block (growth) and ext4_truncate (shrinkth). + */ + loff_t i_disksize; + + /* + * i_data_sem is for serialising ext4_truncate() against + * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext4 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; + struct inode vfs_inode; + struct jbd2_inode *jinode; + + spinlock_t i_raw_lock; /* protects updates to the raw inode */ + + /* + * File creation time. Its function is same as that of + * struct timespec i_{a,c,m}time in the generic inode. + */ + struct timespec i_crtime; + + /* mballoc */ + struct list_head i_prealloc_list; + spinlock_t i_prealloc_lock; + + /* extents status tree */ + struct ext4_es_tree i_es_tree; + rwlock_t i_es_lock; + struct list_head i_es_lru; + unsigned int i_es_lru_nr; /* protected by i_es_lock */ + unsigned long i_touch_when; /* jiffies of last accessing */ + + /* ialloc */ + ext4_group_t i_last_alloc_group; + + /* allocation reservation info for delalloc */ + /* In case of bigalloc, these refer to clusters rather than blocks */ + unsigned int i_reserved_data_blocks; + unsigned int i_reserved_meta_blocks; + unsigned int i_allocated_meta_blocks; + ext4_lblk_t i_da_metadata_calc_last_lblock; + int i_da_metadata_calc_len; + + /* on-disk additional length */ + __u16 i_extra_isize; + + /* Indicate the inline data space. */ + u16 i_inline_off; + u16 i_inline_size; + +#ifdef CONFIG_QUOTA + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif + + /* Lock protecting lists below */ + spinlock_t i_completed_io_lock; + /* + * Completed IOs that need unwritten extents handling and have + * transaction reserved + */ + struct list_head i_rsv_conversion_list; + /* + * Completed IOs that need unwritten extents handling and don't have + * transaction reserved + */ + atomic_t i_ioend_count; /* Number of outstanding io_end structs */ + atomic_t i_unwritten; /* Nr. of inflight conversions pending */ + struct work_struct i_rsv_conversion_work; + + spinlock_t i_block_reservation_lock; + + /* + * Transactions that contain inode's metadata needed to complete + * fsync and fdatasync, respectively. 
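Stepping back to ext4_encode_extra_time()/ext4_decode_extra_time() defined earlier: the 32-bit extra field carries the epoch extension of tv_sec in bits 1:0 and the nanoseconds in bits 31:2. A stand-alone round trip under those rules; the types and names are invented, and, like the kernel code, decode assumes the low 32 bits of tv_sec were loaded sign-extended first:

#include <assert.h>
#include <stdint.h>

#define EPOCH_BITS 2
#define EPOCH_MASK ((1u << EPOCH_BITS) - 1)
#define NSEC_MASK  (~0u << EPOCH_BITS)

struct ts { int64_t sec; uint32_t nsec; };

static uint32_t encode_extra(const struct ts *t)
{
	return ((uint32_t)(t->sec >> 32) & EPOCH_MASK) |
	       (((uint32_t)t->nsec << EPOCH_BITS) & NSEC_MASK);
}

static void decode_extra(struct ts *t, uint32_t extra)
{
	/* t->sec already holds the low 32 bits, sign-extended. */
	t->sec |= (int64_t)(extra & EPOCH_MASK) << 32;
	t->nsec = (extra & NSEC_MASK) >> EPOCH_BITS;
}

int main(void)
{
	/* 2106-02-07T06:28:16Z: one past the unsigned 32-bit range. */
	struct ts in = { .sec = 0x100000000LL, .nsec = 123456789 };
	uint32_t extra = encode_extra(&in);
	struct ts out = { .sec = (int32_t)(uint32_t)in.sec, .nsec = 0 };

	decode_extra(&out, extra);
	assert(out.sec == in.sec && out.nsec == in.nsec);
	return 0;
}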
+ */ + tid_t i_sync_tid; + tid_t i_datasync_tid; + + /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ + __u32 i_csum_seed; +}; + +/* + * File system states + */ +#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT4_ERROR_FS 0x0002 /* Errors detected */ +#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ + +/* + * Misc. filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags set via mount options or defaults + */ +#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_ERRORS_MASK 0x00070 +#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ +#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ +#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ +#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ +#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ +#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ +#define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ + +/* + * Mount flags set either automatically (could not be set by mount option) + * based on per file system feature or property or in special cases such as + * distinguishing between explicit mount option definition and default. 
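One subtlety in the mount flag block above: the three data journaling modes are not independent bits; they share the 2-bit EXT4_MOUNT_DATA_FLAGS field, with writeback being the journal|ordered bit pattern, so mode tests must mask the whole field first. A small sketch with invented names mirroring the values above:

#define MOUNT_DATA_FLAGS     0x00C00 /* 2-bit field, not three flags */
#define MOUNT_JOURNAL_DATA   0x00400
#define MOUNT_ORDERED_DATA   0x00800
#define MOUNT_WRITEBACK_DATA 0x00C00

/* Mask first: testing a single bit would confuse writeback (0xC00)
 * with journal (0x400) or ordered (0x800). */
static const char *data_mode(unsigned int mount_opt)
{
	switch (mount_opt & MOUNT_DATA_FLAGS) {
	case MOUNT_JOURNAL_DATA:   return "journal";
	case MOUNT_ORDERED_DATA:   return "ordered";
	case MOUNT_WRITEBACK_DATA: return "writeback";
	default:                   return "none";
	}
}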
+ */ +#define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly + specified delalloc */ +#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group + size of blocksize * 8 + blocks */ +#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated + file systems */ + +#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ + ~EXT4_MOUNT_##opt +#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ + EXT4_MOUNT_##opt +#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ + EXT4_MOUNT_##opt) + +#define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ + ~EXT4_MOUNT2_##opt +#define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ + EXT4_MOUNT2_##opt +#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ + EXT4_MOUNT2_##opt) + +#define ext4_test_and_set_bit __test_and_set_bit_le +#define ext4_set_bit __set_bit_le +#define ext4_set_bit_atomic ext2_set_bit_atomic +#define ext4_test_and_clear_bit __test_and_clear_bit_le +#define ext4_clear_bit __clear_bit_le +#define ext4_clear_bit_atomic ext2_clear_bit_atomic +#define ext4_test_bit test_bit_le +#define ext4_find_next_zero_bit find_next_zero_bit_le +#define ext4_find_next_bit find_next_bit_le + +extern void ext4_set_bits(void *bm, int cur, int len); + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT4_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT4_ERRORS_PANIC 3 /* Panic */ +#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE + +/* Metadata checksum algorithm codes */ +#define EXT4_CRC32C_CHKSUM 1 + +/* + * Structure of the super block + */ +struct ext4_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count_lo; /* Blocks count */ + __le32 s_r_blocks_count_lo; /* Reserved blocks count */ + __le32 s_free_blocks_count_lo; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_log_cluster_size; /* Allocation cluster size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_clusters_per_group; /* # Clusters per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT4_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. 
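The opt helpers above work by token pasting: set_opt(sb, DELALLOC) expands to EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_DELALLOC at preprocessing time, so call sites never spell out the long constant. The same pattern in stand-alone form, with invented option names:

#include <assert.h>

#define OPT_DELALLOC 0x01
#define OPT_DISCARD  0x02

struct sbi { unsigned int mount_opt; };

/* ##opt pastes the short name onto the OPT_ prefix at expansion time. */
#define set_opt(s, opt)   ((s)->mount_opt |= OPT_##opt)
#define clear_opt(s, opt) ((s)->mount_opt &= ~OPT_##opt)
#define test_opt(s, opt)  ((s)->mount_opt & OPT_##opt)

int main(void)
{
	struct sbi s = { 0 };

	set_opt(&s, DELALLOC);
	assert(test_opt(&s, DELALLOC) && !test_opt(&s, DISCARD));
	clear_opt(&s, DELALLOC);
	assert(!test_opt(&s, DELALLOC));
	return 0;
}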
+ * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[16]; /* volume name */ +/*88*/ char s_last_mounted[64]; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. + */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_jnl_backup_type; + __le16 s_desc_size; /* size of group descriptor */ +/*100*/ __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_checksum_type; /* metadata checksum algorithm used */ + __le16 s_reserved_pad; + __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ + __le32 s_snapshot_inum; /* Inode number of active snapshot */ + __le32 s_snapshot_id; /* sequential ID of active snapshot */ + __le64 s_snapshot_r_blocks_count; /* reserved blocks for active + snapshot's future use */ + __le32 s_snapshot_list; /* inode number of the head of the + on-disk snapshot list */ +#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) + __le32 s_error_count; /* number of fs errors */ + __le32 s_first_error_time; /* first time an error happened */ + __le32 s_first_error_ino; /* inode involved in first error */ + __le64 s_first_error_block; /* block involved of first error */ + __u8 s_first_error_func[32]; /* function where the error happened */ + __le32 s_first_error_line; /* line number where error happened */ + __le32 s_last_error_time; /* 
most recent time of an error */ + __le32 s_last_error_ino; /* inode involved in last error */ + __le32 s_last_error_line; /* line number where error happened */ + __le64 s_last_error_block; /* block involved of last error */ + __u8 s_last_error_func[32]; /* function where the error happened */ +#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) + __u8 s_mount_opts[64]; + __le32 s_usr_quota_inum; /* inode for tracking user quota */ + __le32 s_grp_quota_inum; /* inode for tracking group quota */ + __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ + __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ + __le32 s_reserved[106]; /* Padding to the end of the block */ + __le32 s_checksum; /* crc32c(superblock) */ +}; + +#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) + +#ifdef __KERNEL__ + +/* + * run-time mount flags + */ +#define EXT4_MF_MNTDIR_SAMPLED 0x0001 +#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ + +/* + * fourth extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_clusters_per_group; /* Number of clusters in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ + unsigned long s_overhead; /* # of fs overhead clusters */ + unsigned int s_cluster_ratio; /* Number of blocks per cluster */ + unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + struct buffer_head **s_group_desc; + unsigned int s_mount_opt; + unsigned int s_mount_opt2; + unsigned int s_mount_flags; + unsigned int s_def_mount_opt; + ext4_fsblk_t s_sb_block; + atomic64_t s_resv_clusters; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + unsigned int s_inode_readahead_blks; + unsigned int s_inode_goal; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + struct percpu_counter s_freeclusters_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyclusters_counter; + struct blockgroup_lock *s_blockgroup_lock; + struct proc_dir_entry *s_proc; + struct kobject s_kobj; + struct completion s_kobj_unregister; + struct super_block *s_sb; + + /* Journaling */ + struct journal_s *s_journal; + struct list_head s_orphan; + struct mutex s_orphan_lock; + unsigned long s_resize_flags; /* Flags indicating if there + is a resizer */ + unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; + struct block_device *journal_bdev; +#ifdef CONFIG_QUOTA + char *s_qf_names[MAXQUOTAS]; /* Names of 
quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + struct rb_root system_blks; + +#ifdef EXTENTS_STATS + /* ext4 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + + /* for buddy allocator */ + struct ext4_group_info ***s_group_info; + struct inode *s_buddy_cache; + spinlock_t s_md_lock; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; + unsigned int s_group_info_size; + + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; + unsigned int s_max_dir_size_kb; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; + + /* stats for buddy allocator */ + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + atomic_t s_lock_busy; + + /* locality groups */ + struct ext4_locality_group __percpu *s_locality_groups; + + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + + /* the size of zero-out chunk */ + unsigned int s_extent_max_zeroout_kb; + + unsigned int s_log_groups_per_flex; + struct flex_groups *s_flex_groups; + ext4_group_t s_flex_groups_allocated; + + /* workqueue for reserved extent conversions (buffered io) */ + struct workqueue_struct *rsv_conversion_wq; + + /* timer for periodic error stats printing */ + struct timer_list s_err_report; + + /* Lazy inode table initialization info */ + struct ext4_li_request *s_li_request; + /* Wait multiplier for lazy initialization thread */ + unsigned int s_li_wait_mult; + + /* Kernel thread for multiple mount protection */ + struct task_struct *s_mmp_tsk; + + /* record the last minlen when FITRIM is called. */ + atomic_t s_last_trim_minblks; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *s_chksum_driver; + + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_csum_seed; + + /* Reclaim extents from extent status tree */ + struct shrinker s_es_shrinker; + struct list_head s_es_lru; + unsigned long s_es_last_sorted; + struct percpu_counter s_extent_cache_cnt; + struct mb_cache *s_mb_cache; + spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; + + /* Ratelimit ext4 messages. 
*/ + struct ratelimit_state s_err_ratelimit_state; + struct ratelimit_state s_warning_ratelimit_state; + struct ratelimit_state s_msg_ratelimit_state; +}; + +static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext4_inode_info *EXT4_I(struct inode *inode) +{ + return container_of(inode, struct ext4_inode_info, vfs_inode); +} + +static inline struct timespec ext4_current_time(struct inode *inode) +{ + return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? + current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; +} + +static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT4_ROOT_INO || + ino == EXT4_USR_QUOTA_INO || + ino == EXT4_GRP_QUOTA_INO || + ino == EXT4_BOOT_LOADER_INO || + ino == EXT4_JOURNAL_INO || + ino == EXT4_RESIZE_INO || + (ino >= EXT4_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); +} + +static inline void ext4_set_io_unwritten_flag(struct inode *inode, + struct ext4_io_end *io_end) +{ + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_unwritten); + } +} + +static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode) +{ + return inode->i_private; +} + +static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io) +{ + inode->i_private = io; +} + +/* + * Inode dynamic state flags + */ +enum { + EXT4_STATE_JDATA, /* journaled data exists */ + EXT4_STATE_NEW, /* inode is newly created */ + EXT4_STATE_XATTR, /* has in-inode xattrs */ + EXT4_STATE_NO_EXPAND, /* No space for expansion */ + EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ + EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ + EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ + EXT4_STATE_NEWENTRY, /* File just added to dir */ + EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ + EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read + nolocking */ + EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ + EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ + EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ +}; + +#define EXT4_INODE_BIT_FNS(name, field, offset) \ +static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ +{ \ + return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ +{ \ + set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} \ +static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ +{ \ + clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ +} + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. */ +static inline int ext4_test_inode_flag(struct inode *inode, int bit); +static inline void ext4_set_inode_flag(struct inode *inode, int bit); +static inline void ext4_clear_inode_flag(struct inode *inode, int bit); +EXT4_INODE_BIT_FNS(flag, flags, 0) + +/* Add these declarations here only so that these functions can be + * found by name. Otherwise, they are very hard to locate. 
*/ +static inline int ext4_test_inode_state(struct inode *inode, int bit); +static inline void ext4_set_inode_state(struct inode *inode, int bit); +static inline void ext4_clear_inode_state(struct inode *inode, int bit); +#if (BITS_PER_LONG < 64) +EXT4_INODE_BIT_FNS(state, state_flags, 0) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + (ei)->i_state_flags = 0; +} +#else +EXT4_INODE_BIT_FNS(state, flags, 32) + +static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) +{ + /* We depend on the fact that callers will set i_flags */ +} +#endif +#else +/* Assume that user mode programs are passing in an ext4fs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test + * macros from user land. */ +#define EXT4_SB(sb) (sb) +#endif + +#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT4_OS_LINUX 0 +#define EXT4_OS_HURD 1 +#define EXT4_OS_MASIX 2 +#define EXT4_OS_FREEBSD 3 +#define EXT4_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV +#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV + +#define EXT4_GOOD_OLD_INODE_SIZE 128 + +/* + * Feature set definitions + */ + +#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ + ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0) +#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0) +#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ + ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0) +#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) +#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) +#define EXT4_SET_INCOMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) +#define EXT4_CLEAR_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) +#define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) +#define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) + +#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 +#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 + +#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 +#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 +#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 +#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 +#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 +#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 +/* + * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When + * METADATA_CSUM is set, group descriptor checksums use the same algorithm as + * all other data structures' checksums. However, the METADATA_CSUM and + * GDT_CSUM bits are mutually exclusive. 
+ */ +#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 + +#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ +#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 +#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 +#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 +#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ +#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ +#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ +#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ +#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ + +#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR) + +#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR +#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA) +#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ + EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ + EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ + EXT4_FEATURE_RO_COMPAT_QUOTA) + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT4_DEF_RESUID 0 +#define EXT4_DEF_RESGID 0 + +#define EXT4_DEF_INODE_READAHEAD_BLKS 32 + +/* + * Default mount options + */ +#define EXT4_DEFM_DEBUG 0x0001 +#define EXT4_DEFM_BSDGROUPS 0x0002 +#define EXT4_DEFM_XATTR_USER 0x0004 +#define EXT4_DEFM_ACL 0x0008 +#define EXT4_DEFM_UID16 0x0010 +#define EXT4_DEFM_JMODE 0x0060 +#define EXT4_DEFM_JMODE_DATA 0x0020 +#define EXT4_DEFM_JMODE_ORDERED 0x0040 +#define EXT4_DEFM_JMODE_WBACK 0x0060 +#define EXT4_DEFM_NOBARRIER 0x0100 +#define EXT4_DEFM_BLOCK_VALIDITY 0x0200 +#define EXT4_DEFM_DISCARD 0x0400 +#define EXT4_DEFM_NODELALLOC 0x0800 + +/* + * Default journal batch times + */ +#define EXT4_DEF_MIN_BATCH_TIME 0 +#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ + +/* + * Minimum number of groups in a flexgroup before we separate out + * directories into the first block group of a flexgroup + */ +#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 + +/* + * Structure of a directory entry + */ +#define EXT4_NAME_LEN 255 + +struct ext4_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* 
Name length */ + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT4 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext4_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * This is a bogus directory entry at the end of each leaf block that + * records checksums. + */ +struct ext4_dir_entry_tail { + __le32 det_reserved_zero1; /* Pretend to be unused */ + __le16 det_rec_len; /* 12 */ + __u8 det_reserved_zero2; /* Zero name length */ + __u8 det_reserved_ft; /* 0xDE, fake file type */ + __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ +}; + +#define EXT4_DIRENT_TAIL(block, blocksize) \ + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ + ((blocksize) - \ + sizeof(struct ext4_dir_entry_tail)))) + +/* + * Ext4 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +#define EXT4_FT_UNKNOWN 0 +#define EXT4_FT_REG_FILE 1 +#define EXT4_FT_DIR 2 +#define EXT4_FT_CHRDEV 3 +#define EXT4_FT_BLKDEV 4 +#define EXT4_FT_FIFO 5 +#define EXT4_FT_SOCK 6 +#define EXT4_FT_SYMLINK 7 + +#define EXT4_FT_MAX 8 + +#define EXT4_FT_DIR_CSUM 0xDE + +/* + * EXT4_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT4_DIR_PAD 4 +#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ + ~EXT4_DIR_ROUND) +#define EXT4_MAX_REC_LEN ((1<<16)-1) + +/* + * If we ever get support for fs block sizes > page_size, we'll need + * to remove the #if statements in the next two functions... 
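+ */
+static inline unsigned int
+ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
+{
+ unsigned len = le16_to_cpu(dlen);
+
+#if (PAGE_CACHE_SIZE >= 65536)
+ if (len == EXT4_MAX_REC_LEN || len == 0)
+ return blocksize;
+ return (len & 65532) | ((len & 3) << 16);
+#else
+ return len;
+#endif
+}
+
+static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
+{
+ if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
+ BUG();
+#if (PAGE_CACHE_SIZE >= 65536)
+ if (len < 65536)
+ return cpu_to_le16(len);
+ if (len == blocksize) {
+ if (blocksize == 65536)
+ return cpu_to_le16(EXT4_MAX_REC_LEN);
+ else
+ return cpu_to_le16(0);
+ }
+ return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
+#else
+ return cpu_to_le16(len);
+#endif
+}

A worked example of the encoding the two helpers above implement (not part of the patch; illustrative values, assuming PAGE_CACHE_SIZE >= 65536 so the packed form is active). rec_len is a __le16, but a 64 KiB directory block needs record lengths up to 65536; since rec_len is always a multiple of 4, the low two bits are reused to carry bits 16-17 of the real length:

        len = 70000 (0x11170, a multiple of 4)
        on disk: (70000 & 65532) | ((70000 >> 16) & 3) = 4464 | 1 = 4465
        in core: (4465 & 65532) | ((4465 & 3) << 16) = 4464 | 65536 = 70000

The special cases fall out of the same scheme: a record spanning a whole 65536-byte block is stored as EXT4_MAX_REC_LEN (65535), and decoding maps both 65535 and 0 back to the full blocksize.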
+
+/*
+ * Hash Tree Directory indexing
+ * (c) Daniel Phillips, 2001
+ */
+
+#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
+ EXT4_FEATURE_COMPAT_DIR_INDEX) && \
+ ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
+#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
+#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
+
+/* Legal values for the dx_root hash_version field: */
+
+#define DX_HASH_LEGACY 0
+#define DX_HASH_HALF_MD4 1
+#define DX_HASH_TEA 2
+#define DX_HASH_LEGACY_UNSIGNED 3
+#define DX_HASH_HALF_MD4_UNSIGNED 4
+#define DX_HASH_TEA_UNSIGNED 5
+
+static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
+ const void *address, unsigned int length)
+{
+ struct {
+ struct shash_desc shash;
+ char ctx[4];
+ } desc;
+ int err;
+
+ BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx));
+
+ desc.shash.tfm = sbi->s_chksum_driver;
+ desc.shash.flags = 0;
+ *(u32 *)desc.ctx = crc;
+
+ err = crypto_shash_update(&desc.shash, address, length);
+ BUG_ON(err);
+
+ return *(u32 *)desc.ctx;
+}
+
+#ifdef __KERNEL__
+
+/* hash info structure used by the directory hash */
+struct dx_hash_info
+{
+ u32 hash;
+ u32 minor_hash;
+ int hash_version;
+ u32 *seed;
+};
+
+
+/* 32 and 64 bit signed EOF for dx directories */
+#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1)
+#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1)
+
+
+/*
+ * Control parameters used by ext4_htree_next_block
+ */
+#define HASH_NB_ALWAYS 1
+
+
+/*
+ * Describe an inode's exact location on disk and in memory
+ */
+struct ext4_iloc
+{
+ struct buffer_head *bh;
+ unsigned long offset;
+ ext4_group_t block_group;
+};
+
+static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
+{
+ return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
+}
+
+/*
+ * This structure is stuffed into the struct file's private_data field
+ * for directories. It is where we put information so that we can do
+ * readdir operations in hash tree order.
+ */
+struct dir_private_info {
+ struct rb_root root;
+ struct rb_node *curr_node;
+ struct fname *extra_fname;
+ loff_t last_pos;
+ __u32 curr_hash;
+ __u32 curr_minor_hash;
+ __u32 next_hash;
+};
+
+/* calculate the first block number of the group */
+static inline ext4_fsblk_t
+ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
+{
+ return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) +
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+}
+
+/*
+ * Special error return code only used by dx_probe() and its callers.
+ */
+#define ERR_BAD_DX_DIR -75000
+
+/*
+ * Timeout and state flag for lazy initialization inode thread.
+ */ +#define EXT4_DEF_LI_WAIT_MULT 10 +#define EXT4_DEF_LI_MAX_START_DELAY 5 +#define EXT4_LAZYINIT_QUIT 0x0001 +#define EXT4_LAZYINIT_RUNNING 0x0002 + +/* + * Lazy inode table initialization info + */ +struct ext4_lazy_init { + unsigned long li_state; + struct list_head li_request_list; + struct mutex li_list_mtx; +}; + +struct ext4_li_request { + struct super_block *lr_super; + struct ext4_sb_info *lr_sbi; + ext4_group_t lr_next_group; + struct list_head lr_request; + unsigned long lr_next_sched; + unsigned long lr_timeout; +}; + +struct ext4_features { + struct kobject f_kobj; + struct completion f_kobj_unregister; +}; + +/* + * This structure will be used for multiple mount protection. It will be + * written into the block number saved in the s_mmp_block field in the + * superblock. Programs that check MMP should assume that if + * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe + * to use the filesystem, regardless of how old the timestamp is. + */ +#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ +#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ +#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ +#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ + +struct mmp_struct { + __le32 mmp_magic; /* Magic number for MMP */ + __le32 mmp_seq; /* Sequence no. updated periodically */ + + /* + * mmp_time, mmp_nodename & mmp_bdevname are only used for information + * purposes and do not affect the correctness of the algorithm + */ + __le64 mmp_time; /* Time last updated */ + char mmp_nodename[64]; /* Node which last updated MMP block */ + char mmp_bdevname[32]; /* Bdev which last updated MMP block */ + + /* + * mmp_check_interval is used to verify if the MMP block has been + * updated on the block device. The value is updated based on the + * maximum time to write the MMP block during an update cycle. + */ + __le16 mmp_check_interval; + + __le16 mmp_pad1; + __le32 mmp_pad2[226]; + __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ +}; + +/* arguments passed to the mmp thread */ +struct mmpd_data { + struct buffer_head *bh; /* bh from initial read_mmp_block() */ + struct super_block *sb; /* super block of the fs */ +}; + +/* + * Check interval multiplier + * The MMP block is written every update interval and initially checked every + * update interval x the multiplier (the value is then adapted based on the + * write latency). The reason is that writes can be delayed under load and we + * don't want readers to incorrectly assume that the filesystem is no longer + * in use. + */ +#define EXT4_MMP_CHECK_MULT 2UL + +/* + * Minimum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL + +/* + * Maximum interval for MMP checking in seconds. + */ +#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in <linux/kernel.h> but none of the + * ext4 source programs needs to include it so they are duplicated here. 
+ */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* bitmap.c */ +extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); +void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh, int sz); +void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh); +int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *gdp, + struct buffer_head *bh); + +/* balloc.c */ +extern void ext4_get_group_no_and_offset(struct super_block *sb, + ext4_fsblk_t blocknr, + ext4_group_t *blockgrpp, + ext4_grpblk_t *offsetp); +extern ext4_group_t ext4_get_group_number(struct super_block *sb, + ext4_fsblk_t block); + +extern unsigned int ext4_block_group(struct super_block *sb, + ext4_fsblk_t blocknr); +extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, + ext4_fsblk_t blocknr); +extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); +extern unsigned long ext4_bg_num_gdb(struct super_block *sb, + ext4_group_t group); +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, + unsigned int flags, + unsigned long *count, + int *errp); +extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, + s64 nclusters, unsigned int flags); +extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); +extern void ext4_check_blocks_bitmap(struct super_block *); +extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, + ext4_group_t block_group, + struct buffer_head ** bh); +extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); + +extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, + ext4_group_t block_group); +extern int ext4_wait_block_bitmap(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh); +extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_free_clusters_after_init(struct super_block *sb, + ext4_group_t block_group, + struct ext4_group_desc *gdp); +ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); + +/* dir.c */ +extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct file *, + struct ext4_dir_entry_2 *, + struct buffer_head *, char *, int, + unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ + unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ + (de), (bh), (buf), (size), (offset))) +extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent); +extern void ext4_htree_free_dir_info(struct dir_private_info *p); +extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + const char *name, int namelen, + struct ext4_dir_entry_2 **dest_de); +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + const char *name, int namelen); +static inline void ext4_update_dx_flag(struct inode *inode) +{ + if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX)) + ext4_clear_inode_flag(inode, 
EXT4_INODE_INDEX); +} +static unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static inline unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || + (filetype >= EXT4_FT_MAX)) + return DT_UNKNOWN; + + return ext4_filetype_table[filetype]; +} + +/* fsync.c */ +extern int ext4_sync_file(struct file *, loff_t, loff_t, int); + +/* hash.c */ +extern int ext4fs_dirhash(const char *name, int len, struct + dx_hash_info *hinfo); + +/* ialloc.c */ +extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, + const struct qstr *qstr, __u32 goal, + uid_t *owner, int handle_type, + unsigned int line_no, int nblocks); + +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner) \ + __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ + 0, 0, 0) +#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ + type, nblocks) \ + __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ + (type), __LINE__, (nblocks)) + + +extern void ext4_free_inode(handle_t *, struct inode *); +extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); +extern unsigned long ext4_count_free_inodes(struct super_block *); +extern unsigned long ext4_count_dirs(struct super_block *); +extern void ext4_check_inodes_bitmap(struct super_block *); +extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +extern int ext4_init_inode_table(struct super_block *sb, + ext4_group_t group, int barrier); +extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); + +/* mballoc.c */ +extern long ext4_mb_stats; +extern long ext4_mb_max_to_scan; +extern int ext4_mb_init(struct super_block *); +extern int ext4_mb_release(struct super_block *); +extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, + struct ext4_allocation_request *, int *); +extern int ext4_mb_reserve_blocks(struct super_block *, int); +extern void ext4_discard_preallocations(struct inode *); +extern int __init ext4_init_mballoc(void); +extern void ext4_exit_mballoc(void); +extern void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags); +extern int ext4_mb_alloc_groupinfo(struct super_block *sb, + ext4_group_t ngroups); +extern int ext4_mb_add_groupinfo(struct super_block *sb, + ext4_group_t i, struct ext4_group_desc *desc); +extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); +extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); + +/* inode.c */ +struct buffer_head *ext4_getblk(handle_t *, struct inode *, + ext4_lblk_t, int, int *); +struct buffer_head *ext4_bread(handle_t *, struct inode *, + ext4_lblk_t, int, int *); +int ext4_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create); +int ext4_walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)); +int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh); +#define FALL_BACK_TO_NONDELALLOC 1 +#define CONVERT_INLINE_DATA 2 + +extern struct 
inode *ext4_iget(struct super_block *, unsigned long); +extern int ext4_write_inode(struct inode *, struct writeback_control *); +extern int ext4_setattr(struct dentry *, struct iattr *); +extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +extern void ext4_evict_inode(struct inode *); +extern void ext4_clear_inode(struct inode *); +extern int ext4_sync_inode(handle_t *, struct inode *); +extern void ext4_dirty_inode(struct inode *, int); +extern int ext4_change_inode_journal_flag(struct inode *, int); +extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern int ext4_inode_attach_jinode(struct inode *inode); +extern int ext4_can_truncate(struct inode *inode); +extern void ext4_truncate(struct inode *); +extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); +extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); +extern void ext4_set_inode_flags(struct inode *); +extern void ext4_get_inode_flags(struct ext4_inode_info *); +extern int ext4_alloc_da_blocks(struct inode *inode); +extern void ext4_set_aops(struct inode *inode); +extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, + loff_t lstart, loff_t lend); +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim); + +/* indirect.c */ +extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, + struct iov_iter *iter, loff_t offset); +extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); +extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); +extern void ext4_ind_truncate(handle_t *, struct inode *inode); +extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t first, ext4_lblk_t stop); + +/* ioctl.c */ +extern long ext4_ioctl(struct file *, unsigned int, unsigned long); +extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); + +/* migrate.c */ +extern int ext4_ext_migrate(struct inode *); +extern int ext4_ind_migrate(struct inode *inode); + +/* namei.c */ +extern int ext4_dirent_csum_verify(struct inode *inode, + struct ext4_dir_entry *dirent); +extern int ext4_orphan_add(handle_t *, struct inode *); +extern int ext4_orphan_del(handle_t *, struct inode *); +extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); +extern int search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); +extern int ext4_generic_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size); + +/* resize.c */ +extern int ext4_group_add(struct super_block *sb, + struct ext4_new_group_data *input); +extern int ext4_group_extend(struct super_block *sb, + struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count); +extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); + +/* super.c */ +extern int ext4_calculate_overhead(struct 
super_block *sb); +extern void ext4_superblock_csum_set(struct super_block *sb); +extern void *ext4_kvmalloc(size_t size, gfp_t flags); +extern void *ext4_kvzalloc(size_t size, gfp_t flags); +extern void ext4_kvfree(void *ptr); +extern int ext4_alloc_flex_bg_array(struct super_block *sb, + ext4_group_t ngroup); +extern const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]); + +extern __printf(4, 5) +void __ext4_error(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(5, 6) +void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern __printf(5, 6) +void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, + const char *, ...); +extern void __ext4_std_error(struct super_block *, const char *, + unsigned int, int); +extern __printf(4, 5) +void __ext4_abort(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(4, 5) +void __ext4_warning(struct super_block *, const char *, unsigned int, + const char *, ...); +extern __printf(3, 4) +void __ext4_msg(struct super_block *, const char *, const char *, ...); +extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, + const char *, unsigned int, const char *); +extern __printf(7, 8) +void __ext4_grp_locked_error(const char *, unsigned int, + struct super_block *, ext4_group_t, + unsigned long, ext4_fsblk_t, + const char *, ...); + +#ifdef CONFIG_PRINTK + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ + __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error_file(file, func, line, block, fmt, ...) \ + __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) +#define ext4_error(sb, fmt, ...) \ + __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_abort(sb, fmt, ...) \ + __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning(sb, fmt, ...) \ + __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_msg(sb, level, fmt, ...) \ + __ext4_msg(sb, level, fmt, ##__VA_ARGS__) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ + __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ + fmt, ##__VA_ARGS__) + +#else + +#define ext4_error_inode(inode, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_inode(inode, "", 0, block, " "); \ +} while (0) +#define ext4_error_file(file, func, line, block, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_file(file, "", 0, block, " "); \ +} while (0) +#define ext4_error(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error(sb, "", 0, " "); \ +} while (0) +#define ext4_abort(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_abort(sb, "", 0, " "); \ +} while (0) +#define ext4_warning(sb, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning(sb, "", 0, " "); \ +} while (0) +#define ext4_msg(sb, level, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_msg(sb, "", " "); \ +} while (0) +#define dump_mmp_msg(sb, mmp, msg) \ + __dump_mmp_msg(sb, mmp, "", 0, "") +#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) 
\ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ +} while (0) + +#endif + +extern void ext4_update_dynamic_rev(struct super_block *sb); +extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, + __u32 compat); +extern int ext4_update_rocompat_feature(handle_t *handle, + struct super_block *sb, __u32 rocompat); +extern int ext4_update_incompat_feature(handle_t *handle, + struct super_block *sb, __u32 incompat); +extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, + __u32 count); +extern void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed); + +static inline int ext4_has_group_desc_csum(struct super_block *sb) +{ + return EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM | + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM); +} + +static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | + le32_to_cpu(es->s_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | + le32_to_cpu(es->s_r_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) | + le32_to_cpu(es->s_free_blocks_count_lo); +} + +static inline void ext4_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + 
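es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
+}

These setters split a 64-bit count across the 32-bit _lo/_hi superblock fields; the getters above reassemble it. A quick round trip with illustrative numbers (not part of the patch): for blk = 0x400000000 (2^34 blocks, i.e. 64 TiB of 4 KiB blocks), ext4_blocks_count_set() stores s_blocks_count_lo = 0x00000000 and s_blocks_count_hi = 0x00000004, and ext4_blocks_count() recovers ((ext4_fsblk_t)4 << 32) | 0 = 0x400000000.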
+
+static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
+{
+ if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
+ return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
+ le32_to_cpu(raw_inode->i_size_lo);
+ else
+ return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
+}
+
+static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
+{
+ raw_inode->i_size_lo = cpu_to_le32(i_size);
+ raw_inode->i_size_high = cpu_to_le32(i_size >> 32);
+}
+
+static inline
+struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
+ ext4_group_t group)
+{
+ struct ext4_group_info ***grp_info;
+ long indexv, indexh;
+ BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
+ grp_info = EXT4_SB(sb)->s_group_info;
+ indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
+ indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
+ return grp_info[indexv][indexh];
+}
+
+/*
+ * Reading s_groups_count requires using smp_rmb() afterwards. See
+ * the locking protocol documented in the comments of ext4_group_add()
+ * in resize.c
+ */
+static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
+{
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+
+ smp_rmb();
+ return ngroups;
+}
+
+static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
+ ext4_group_t block_group)
+{
+ return block_group >> sbi->s_log_groups_per_flex;
+}
+
+static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
+{
+ return 1 << sbi->s_log_groups_per_flex;
+}
+
+#define ext4_std_error(sb, errno) \
+do { \
+ if ((errno)) \
+ __ext4_std_error((sb), __func__, __LINE__, (errno)); \
+} while (0)
+
+#ifdef CONFIG_SMP
+/* Each CPU can accumulate percpu_counter_batch clusters in their local
+ * counters. So we need to make sure we have free clusters more
+ * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times.
+ */
+#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
+#else
+#define EXT4_FREECLUSTERS_WATERMARK 0
+#endif
+
+/* Update i_disksize. Requires i_mutex to avoid races with truncate */
+static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
+{
+ WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
+ !mutex_is_locked(&inode->i_mutex));
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (newsize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = newsize;
+ up_write(&EXT4_I(inode)->i_data_sem);
+}
+
+struct ext4_group_info {
+ unsigned long bb_state;
+ struct rb_root bb_free_root;
+ ext4_grpblk_t bb_first_free; /* first free block */
+ ext4_grpblk_t bb_free; /* total free blocks */
+ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
+ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
+ struct list_head bb_prealloc_list;
+#ifdef DOUBLE_CHECK
+ void *bb_bitmap;
+#endif
+ struct rw_semaphore alloc_sem;
+ ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
+ * regions, index is order.
+ * bb_counters[3] = 5 means
+ * 5 free 8-block regions.
*/ +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ + (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) + +#define EXT4_MB_GRP_WAS_TRIMMED(grp) \ + (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_SET_TRIMMED(grp) \ + (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) +#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ + (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) + +#define EXT4_MAX_CONTENTION 8 +#define EXT4_CONTENTION_THRESHOLD 2 + +static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, + ext4_group_t group) +{ + return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); +} + +/* + * Returns true if the filesystem is busy enough that attempts to + * access the block group locks has run into contention. + */ +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) +{ + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); +} + +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + spinlock_t *lock = ext4_group_lock_ptr(sb, group); + if (spin_trylock(lock)) + /* + * We're able to grab the lock right away, so drop the + * lock contention counter. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); + else { + /* + * The lock is busy, so bump the contention counter, + * and then wait on the spin lock. + */ + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, + EXT4_MAX_CONTENTION); + spin_lock(lock); + } +} + +static inline void ext4_unlock_group(struct super_block *sb, + ext4_group_t group) +{ + spin_unlock(ext4_group_lock_ptr(sb, group)); +} + +/* + * Block validity checking + */ +#define ext4_check_indirect_blockref(inode, bh) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ + EXT4_ADDR_PER_BLOCK((inode)->i_sb)) + +#define ext4_ind_check_inode(inode) \ + ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ + EXT4_NDIR_BLOCKS) + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext4_dir_operations; + +/* file.c */ +extern const struct inode_operations ext4_file_inode_operations; +extern const struct file_operations ext4_file_operations; +extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); + +/* inline.c */ +extern int ext4_has_inline_data(struct inode *inode); +extern int ext4_get_max_inline_size(struct inode *inode); +extern int ext4_find_inline_data_nolock(struct inode *inode); +extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, + unsigned int len); +extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); + +extern int ext4_readpage_inline(struct inode *inode, struct page *page); +extern int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep); +extern int ext4_write_inline_data_end(struct inode *inode, + loff_t pos, unsigned len, + unsigned copied, + struct page *page); +extern struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct page *page); +extern 
int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata); +extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page); +extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); +extern int ext4_try_create_inline_dir(handle_t *handle, + struct inode *parent, + struct inode *inode); +extern int ext4_read_inline_dir(struct file *filp, + struct dir_context *ctx, + int *has_inline_data); +extern int htree_inlinedir_to_tree(struct file *dir_file, + struct inode *dir, ext4_lblk_t block, + struct dx_hash_info *hinfo, + __u32 start_hash, __u32 start_minor_hash, + int *has_inline_data); +extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data); +extern int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data); +extern int empty_inline_dir(struct inode *dir, int *has_inline_data); +extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval); +extern int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline); +extern int ext4_try_to_evict_inline_data(handle_t *handle, + struct inode *inode, + int needed); +extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); + +extern int ext4_convert_inline_data(struct inode *inode); + +/* namei.c */ +extern const struct inode_operations ext4_dir_inode_operations; +extern const struct inode_operations ext4_special_inode_operations; +extern struct dentry *ext4_get_parent(struct dentry *child); +extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len); +extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t, + unsigned int blocksize); +extern int ext4_handle_dirty_dirent_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh); +#define S_SHIFT 12 +static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, + [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, + [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, +}; + +static inline void ext4_set_de_type(struct super_block *sb, + struct ext4_dir_entry_2 *de, + umode_t mode) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + + +/* symlink.c */ +extern const struct inode_operations ext4_symlink_inode_operations; +extern const struct inode_operations ext4_fast_symlink_inode_operations; + +/* block_validity */ +extern void ext4_release_system_zone(struct super_block *sb); +extern int ext4_setup_system_zone(struct super_block *sb); +extern int __init ext4_init_system_zone(void); +extern void ext4_exit_system_zone(void); +extern int ext4_data_block_valid(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count); +extern int ext4_check_blockref(const char *, unsigned int, + struct inode *, __le32 *, unsigned int); + +/* extents.c */ 
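Before the declarations themselves, a hedged sketch of how a caller typically drives this interface (not part of the patch; the helper is hypothetical and assumes the struct ext4_map_blocks fields m_lblk/m_len/m_pblk declared earlier in this header, plus the ext4_map_blocks() prototype below; a NULL handle with flags == 0 performs a pure lookup and never allocates):

static inline int example_lookup_pblock(struct inode *inode, ext4_lblk_t lblk,
                                        ext4_fsblk_t *pblk)
{
        struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = 1 };
        int ret;

        /* flags == 0: query the existing mapping only */
        ret = ext4_map_blocks(NULL, inode, &map, 0);
        if (ret > 0)
                *pblk = map.m_pblk;
        return ret;     /* > 0: blocks mapped, 0: hole, < 0: error */
}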
+struct ext4_ext_path; +struct ext4_extent; + +/* + * Maximum number of logical blocks in a file; ext4_extent's ee_block is + * __le32. + */ +#define EXT_MAX_BLOCKS 0xffffffff + +extern int ext4_ext_tree_init(handle_t *handle, struct inode *); +extern int ext4_ext_writepage_trans_blocks(struct inode *, int); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); +extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern void ext4_ext_truncate(handle_t *, struct inode *); +extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end); +extern void ext4_ext_init(struct super_block *); +extern void ext4_ext_release(struct super_block *); +extern long ext4_fallocate(struct file *file, int mode, loff_t offset, + loff_t len); +extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len); +extern int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + ext4_lblk_t lblocks); +extern int ext4_extent_tree_init(handle_t *, struct inode *); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, + struct ext4_ext_path *, + struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path *, + int flags); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +extern int ext4_find_delalloc_range(struct inode *inode, + ext4_lblk_t lblk_start, + ext4_lblk_t lblk_end); +extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); +extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +extern int ext4_ext_precache(struct inode *inode); +extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); + +/* move_extent.c */ +extern void ext4_double_down_write_data_sem(struct inode *first, + struct inode *second); +extern void ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode); +extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, + __u64 start_orig, __u64 start_donor, + __u64 len, __u64 *moved_len); +extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, + struct ext4_extent **extent); + +/* page-io.c */ +extern int __init ext4_init_pageio(void); +extern void ext4_exit_pageio(void); +extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); +extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); +extern int ext4_put_io_end(ext4_io_end_t *io_end); +extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); +extern void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc); +extern void ext4_end_io_rsv_work(struct work_struct *work); +extern void ext4_io_submit(struct ext4_io_submit *io); +extern int ext4_bio_write_page(struct ext4_io_submit *io, + struct page *page, + int len, + struct writeback_control *wbc, + bool keep_towrite); + +/* mmp.c */ +extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); + +/* + * Note that these flags will never ever appear in a 
buffer_head's state flag.
+ * See EXT4_MAP_... to see where this is used.
+ */
+enum ext4_state_bits {
+ BH_AllocFromCluster /* allocated blocks were part of already
+ * allocated cluster. */
+ = BH_JBDPrivateStart
+};
+
+/*
+ * Add new method to test whether block and inode bitmaps are properly
+ * initialized. With uninit_bg reading the block from disk is not enough
+ * to mark the bitmap uptodate. We need to also zero-out the bitmap
+ */
+#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
+
+static inline int bitmap_uptodate(struct buffer_head *bh)
+{
+ return (buffer_uptodate(bh) &&
+ test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state));
+}
+static inline void set_bitmap_uptodate(struct buffer_head *bh)
+{
+ set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
+}
+
+/*
+ * Disable DIO read nolock optimization, so new dioreaders will be forced
+ * to grab i_mutex
+ */
+static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
+{
+ ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+ smp_mb();
+}
+static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
+{
+ smp_mb();
+ ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+}
+
+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
+
+/* For ioend & aio unwritten conversion wait queues */
+#define EXT4_WQ_HASH_SZ 37
+#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
+ EXT4_WQ_HASH_SZ])
+#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\
+ EXT4_WQ_HASH_SZ])
+extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
+extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
+
+#define EXT4_RESIZING 0
+extern int ext4_resize_begin(struct super_block *sb);
+extern void ext4_resize_end(struct super_block *sb);
+
+#endif /* __KERNEL__ */
+
+#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
new file mode 100644
index 00000000000..a867f5ca999
--- /dev/null
+++ b/fs/ext4/ext4_extents.h
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
+ */
+
+#ifndef _EXT4_EXTENTS
+#define _EXT4_EXTENTS
+
+#include "ext4.h"
+
+/*
+ * With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks
+ * becomes very small, so index split, in-depth growing and
+ * other hard changes happen much more often.
+ * This is for debug purposes only.
+ */
+#define AGGRESSIVE_TEST_
+
+/*
+ * With EXTENTS_STATS defined, the number of blocks and extents
+ * are collected in the truncate path. They'll be shown at
+ * umount time.
+ */
+#define EXTENTS_STATS__
+
+/*
+ * If CHECK_BINSEARCH is defined, then the results of the binary search
+ * will also be checked by linear search.
+ */
+#define CHECK_BINSEARCH__
+
+/*
+ * If EXT_STATS is defined then stats numbers are collected.
+ * These numbers will be displayed at umount time.
+ */
+#define EXT_STATS_
+
+
+/*
+ * ext4_inode has i_block array (60 bytes total).
+ * The first 12 bytes store ext4_extent_header;
+ * the remainder stores an array of ext4_extent.
+ * For non-inode extent blocks, ext4_extent_tail
+ * follows the array.
+ */
+
+/*
+ * This is the extent tail on-disk structure.
+ * All other extent structures are 12 bytes long. It turns out that
+ * block_size % 12 >= 4 for at least all powers of 2 greater than 512, which
+ * covers all valid ext4 block sizes. Therefore, this tail structure can be
+ * crammed into the end of the block without having to rebalance the tree.
+ */
+struct ext4_extent_tail {
+ __le32 et_checksum; /* crc32c(uuid+inum+extent_block) */
+};
+
+/*
+ * This is the extent on-disk structure.
+ * It's used at the bottom of the tree.
+ */
+struct ext4_extent {
+ __le32 ee_block; /* first logical block extent covers */
+ __le16 ee_len; /* number of blocks covered by extent */
+ __le16 ee_start_hi; /* high 16 bits of physical block */
+ __le32 ee_start_lo; /* low 32 bits of physical block */
+};
+
+/*
+ * This is the index on-disk structure.
+ * It's used at all the levels except the bottom.
+ */
+struct ext4_extent_idx {
+ __le32 ei_block; /* index covers logical blocks from 'block' */
+ __le32 ei_leaf_lo; /* pointer to the physical block of the next *
+ * level. leaf or next index could be there */
+ __le16 ei_leaf_hi; /* high 16 bits of physical block */
+ __u16 ei_unused;
+};
+
+/*
+ * Each block (leaves and indexes), even if inode-stored, has a header.
+ */
+struct ext4_extent_header {
+ __le16 eh_magic; /* probably will support different formats */
+ __le16 eh_entries; /* number of valid entries */
+ __le16 eh_max; /* capacity of store in entries */
+ __le16 eh_depth; /* has tree real underlying blocks? */
+ __le32 eh_generation; /* generation of the tree */
+};
+
+#define EXT4_EXT_MAGIC cpu_to_le16(0xf30a)
+
+#define EXT4_EXTENT_TAIL_OFFSET(hdr) \
+ (sizeof(struct ext4_extent_header) + \
+ (sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max)))
+
+static inline struct ext4_extent_tail *
+find_ext4_extent_tail(struct ext4_extent_header *eh)
+{
+ return (struct ext4_extent_tail *)(((void *)eh) +
+ EXT4_EXTENT_TAIL_OFFSET(eh));
+}
+
+/*
+ * An array of ext4_ext_path structures contains the path to some extent.
+ * Creation/lookup routines use it for traversal/splitting/etc.
+ * Truncate uses it to simulate recursive walking.
+ */
+struct ext4_ext_path {
+ ext4_fsblk_t p_block;
+ __u16 p_depth;
+ struct ext4_extent *p_ext;
+ struct ext4_extent_idx *p_idx;
+ struct ext4_extent_header *p_hdr;
+ struct buffer_head *p_bh;
+};
+
+/*
+ * structure for external API
+ */
+
+/*
+ * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
+ * initialized extent. This is 2^15 and not (2^16 - 1), since we use the
+ * MSB of ee_len field in the extent datastructure to signify if this
+ * particular extent is an initialized extent or an unwritten (i.e.
+ * preallocated) one.
+ * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an
+ * unwritten extent.
+ * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an
+ * unwritten one. In other words, if MSB of ee_len is set, it is an
+ * unwritten extent with only one special scenario when ee_len = 0x8000.
+ * In this case we can not have an unwritten extent of zero length and
+ * thus we make it a special case of an initialized extent with 0x8000 length.
+ * This way we get better extent-to-group alignment for initialized extents.
+ * Hence, the maximum number of blocks we can have in an *initialized* + * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767). + */ +#define EXT_INIT_MAX_LEN (1UL << 15) +#define EXT_UNWRITTEN_MAX_LEN (EXT_INIT_MAX_LEN - 1) + + +#define EXT_FIRST_EXTENT(__hdr__) \ + ((struct ext4_extent *) (((char *) (__hdr__)) + \ + sizeof(struct ext4_extent_header))) +#define EXT_FIRST_INDEX(__hdr__) \ + ((struct ext4_extent_idx *) (((char *) (__hdr__)) + \ + sizeof(struct ext4_extent_header))) +#define EXT_HAS_FREE_INDEX(__path__) \ + (le16_to_cpu((__path__)->p_hdr->eh_entries) \ + < le16_to_cpu((__path__)->p_hdr->eh_max)) +#define EXT_LAST_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_LAST_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_MAX_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) + +static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) +{ + return (struct ext4_extent_header *) EXT4_I(inode)->i_data; +} + +static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh) +{ + return (struct ext4_extent_header *) bh->b_data; +} + +static inline unsigned short ext_depth(struct inode *inode) +{ + return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); +} + +static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext) +{ + /* We can not have an unwritten extent of zero length! */ + BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0); + ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN); +} + +static inline int ext4_ext_is_unwritten(struct ext4_extent *ext) +{ + /* Extent with ee_len of 0x8000 is treated as an initialized extent */ + return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN); +} + +static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) +{ + return (le16_to_cpu(ext->ee_len) <= EXT_INIT_MAX_LEN ? 
+ le16_to_cpu(ext->ee_len) : + (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); +} + +static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) +{ + ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); +} + +/* + * ext4_ext_pblock: + * combine low and high parts of physical block number into ext4_fsblk_t + */ +static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex) +{ + ext4_fsblk_t block; + + block = le32_to_cpu(ex->ee_start_lo); + block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1; + return block; +} + +/* + * ext4_idx_pblock: + * combine low and high parts of a leaf physical block number into ext4_fsblk_t + */ +static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix) +{ + ext4_fsblk_t block; + + block = le32_to_cpu(ix->ei_leaf_lo); + block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1; + return block; +} + +/* + * ext4_ext_store_pblock: + * stores a large physical block number into an extent struct, + * breaking it into parts + */ +static inline void ext4_ext_store_pblock(struct ext4_extent *ex, + ext4_fsblk_t pb) +{ + ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); + ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & + 0xffff); +} + +/* + * ext4_idx_store_pblock: + * stores a large physical block number into an index struct, + * breaking it into parts + */ +static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, + ext4_fsblk_t pb) +{ + ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); + ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & + 0xffff); +} + +#define ext4_ext_dirty(handle, inode, path) \ + __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) +int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, + struct inode *inode, struct ext4_ext_path *path); + +#endif /* _EXT4_EXTENTS */ + diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index d6afe4e2734..0074e0d23d6 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -2,58 +2,320 @@ * Interface between ext4 and JBD */ -#include <linux/ext4_jbd2.h> +#include "ext4_jbd2.h" -int __ext4_journal_get_undo_access(const char *where, handle_t *handle, - struct buffer_head *bh) +#include <trace/events/ext4.h> + +/* Just increment the non-pointer handle value */ +static handle_t *ext4_get_nojournal(void) { - int err = jbd2_journal_get_undo_access(handle, bh); - if (err) - ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err); - return err; + handle_t *handle = current->journal_info; + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); + + ref_cnt++; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; + return handle; } -int __ext4_journal_get_write_access(const char *where, handle_t *handle, - struct buffer_head *bh) + +/* Decrement the non-pointer handle value */ +static void ext4_put_nojournal(handle_t *handle) { - int err = jbd2_journal_get_write_access(handle, bh); + unsigned long ref_cnt = (unsigned long)handle; + + BUG_ON(ref_cnt == 0); + + ref_cnt--; + handle = (handle_t *)ref_cnt; + + current->journal_info = handle; +} + +/* + * Wrappers for jbd2_journal_start/end. 
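+ * + * A typical caller is sketched here (the credit count is illustrative): + * + * handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + * if (IS_ERR(handle)) + * return PTR_ERR(handle); + * ... modify metadata under the handle ... + * err = ext4_journal_stop(handle); + * + * In no-journal mode the same calls fall back to the get/put_nojournal + * reference counting above, so callers do not need to special-case it.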
+ */ +static int ext4_journal_check_start(struct super_block *sb) +{ + journal_t *journal; + + might_sleep(); + if (sb->s_flags & MS_RDONLY) + return -EROFS; + WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); + journal = EXT4_SB(sb)->s_journal; + /* + * Special case here: if the journal has aborted behind our + * backs (eg. EIO in the commit thread), then we still need to + * take the FS itself readonly cleanly. + */ + if (journal && is_journal_aborted(journal)) { + ext4_abort(sb, "Detected aborted journal"); + return -EROFS; + } + return 0; +} + +handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, + int type, int blocks, int rsv_blocks) +{ + journal_t *journal; + int err; + + trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_); + err = ext4_journal_check_start(sb); + if (err < 0) + return ERR_PTR(err); + + journal = EXT4_SB(sb)->s_journal; + if (!journal) + return ext4_get_nojournal(); + return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS, + type, line); +} + +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) +{ + struct super_block *sb; + int err; + int rc; + + if (!ext4_handle_valid(handle)) { + ext4_put_nojournal(handle); + return 0; + } + sb = handle->h_transaction->t_journal->j_private; + err = handle->h_err; + rc = jbd2_journal_stop(handle); + + if (!err) + err = rc; if (err) - ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + __ext4_std_error(sb, where, line, err); return err; } -int __ext4_journal_forget(const char *where, handle_t *handle, - struct buffer_head *bh) +handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, + int type) { - int err = jbd2_journal_forget(handle, bh); - if (err) - ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + struct super_block *sb; + int err; + + if (!ext4_handle_valid(handle)) + return ext4_get_nojournal(); + + sb = handle->h_journal->j_private; + trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits, + _RET_IP_); + err = ext4_journal_check_start(sb); + if (err < 0) { + jbd2_journal_free_reserved(handle); + return ERR_PTR(err); + } + + err = jbd2_journal_start_reserved(handle, type, line); + if (err < 0) + return ERR_PTR(err); + return handle; +} + +static void ext4_journal_abort_handle(const char *caller, unsigned int line, + const char *err_fn, + struct buffer_head *bh, + handle_t *handle, int err) +{ + char nbuf[16]; + const char *errstr = ext4_decode_error(NULL, err, nbuf); + + BUG_ON(!ext4_handle_valid(handle)); + + if (bh) + BUFFER_TRACE(bh, "abort"); + + if (!handle->h_err) + handle->h_err = err; + + if (is_handle_aborted(handle)) + return; + + printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n", + caller, line, errstr, err_fn); + + jbd2_journal_abort_handle(handle); +} + +int __ext4_journal_get_write_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh) +{ + int err = 0; + + might_sleep(); + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_write_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, bh, + handle, err); + } return err; } -int __ext4_journal_revoke(const char *where, handle_t *handle, - ext4_fsblk_t blocknr, struct buffer_head *bh) +/* + * The ext4 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. 
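+ * + * Roughly, the decision made below (a sketch of the cases, not normative): + * no journal at all -> bforget() + * data=journal mount -> jbd2_journal_forget() (never revoke) + * un-journaled data block -> jbd2_journal_forget() + * metadata, or journaled data -> jbd2_journal_revoke()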
+ * + * "bh" may be NULL: a metadata block may have been freed from memory + * but there may still be a record of it in the journal, and that record + * still needs to be revoked. + * + * If the handle isn't valid we're not journaling, but we still need to + * call into ext4_journal_revoke() to put the buffer head. + */ +int __ext4_forget(const char *where, unsigned int line, handle_t *handle, + int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr) { - int err = jbd2_journal_revoke(handle, blocknr, bh); - if (err) - ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + int err; + + might_sleep(); + + trace_ext4_forget(inode, is_metadata, blocknr); + BUFFER_TRACE(bh, "enter"); + + jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " + "data mode %x\n", + bh, is_metadata, inode->i_mode, + test_opt(inode->i_sb, DATA_FLAGS)); + + /* In the no journal case, we can just do a bforget and return */ + if (!ext4_handle_valid(handle)) { + bforget(bh); + return 0; + } + + /* Never use the revoke function if we are doing full data + * journaling: there is no need to, and a V1 superblock won't + * support it. Otherwise, only skip the revoke on un-journaled + * data blocks. */ + + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (!is_metadata && !ext4_should_journal_data(inode))) { + if (bh) { + BUFFER_TRACE(bh, "call jbd2_journal_forget"); + err = jbd2_journal_forget(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + return err; + } + return 0; + } + + /* + * data!=journal && (is_metadata || should_journal_data(inode)) + */ + BUFFER_TRACE(bh, "call jbd2_journal_revoke"); + err = jbd2_journal_revoke(handle, blocknr, bh); + if (err) { + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + __ext4_abort(inode->i_sb, where, line, + "error %d when attempting revoke", err); + } + BUFFER_TRACE(bh, "exit"); return err; } -int __ext4_journal_get_create_access(const char *where, +int __ext4_journal_get_create_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh) { - int err = jbd2_journal_get_create_access(handle, bh); - if (err) - ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_create_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + } return err; } -int __ext4_journal_dirty_metadata(const char *where, - handle_t *handle, struct buffer_head *bh) +int __ext4_handle_dirty_metadata(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct buffer_head *bh) { - int err = jbd2_journal_dirty_metadata(handle, bh); - if (err) - ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + int err = 0; + + might_sleep(); + + set_buffer_meta(bh); + set_buffer_prio(bh); + if (ext4_handle_valid(handle)) { + err = jbd2_journal_dirty_metadata(handle, bh); + /* Errors can only happen if there is a bug */ + if (WARN_ON_ONCE(err)) { + ext4_journal_abort_handle(where, line, __func__, bh, + handle, err); + if (inode == NULL) { + pr_err("EXT4: jbd2_journal_dirty_metadata " + "failed: handle type %u started at " + "line %u, credits %u/%u, errcode %d", + handle->h_type, + handle->h_line_no, + handle->h_requested_credits, + handle->h_buffer_credits, err); + return err; + } + ext4_error_inode(inode, where, line, + bh->b_blocknr, + "journal_dirty_metadata failed: " + "handle type %u started at 
line %u, " + "credits %u/%u, errcode %d", + handle->h_type, + handle->h_line_no, + handle->h_requested_credits, + handle->h_buffer_credits, err); + } + } else { + if (inode) + mark_buffer_dirty_inode(bh, inode); + else + mark_buffer_dirty(bh); + if (inode && inode_needs_sync(inode)) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + struct ext4_super_block *es; + + es = EXT4_SB(inode->i_sb)->s_es; + es->s_last_error_block = + cpu_to_le64(bh->b_blocknr); + ext4_error_inode(inode, where, line, + bh->b_blocknr, + "IO error syncing itable block"); + err = -EIO; + } + } + } + return err; +} + +int __ext4_handle_dirty_super(const char *where, unsigned int line, + handle_t *handle, struct super_block *sb) +{ + struct buffer_head *bh = EXT4_SB(sb)->s_sbh; + int err = 0; + + ext4_superblock_csum_set(sb); + if (ext4_handle_valid(handle)) { + err = jbd2_journal_dirty_metadata(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + } else + mark_buffer_dirty(bh); return err; } diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h new file mode 100644 index 00000000000..17c00ff202f --- /dev/null +++ b/fs/ext4/ext4_jbd2.h @@ -0,0 +1,450 @@ +/* + * ext4_jbd2.h + * + * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 + * + * Copyright 1998--1999 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Ext4-specific journaling extensions. + */ + +#ifndef _EXT4_JBD2_H +#define _EXT4_JBD2_H + +#include <linux/fs.h> +#include <linux/jbd2.h> +#include "ext4.h" + +#define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal) + +/* Define the number of blocks we need to account to a transaction to + * modify one block of data. + * + * We may have to touch one inode, one bitmap buffer, up to three + * indirection blocks, the group and superblock summaries, and the data + * block to complete the transaction. + * + * For an extents-enabled fs we may have to allocate and modify up to + * 5 levels of tree, a data block (for each of these we need bitmap + + * group summaries), and the root, which is stored in the inode, plus the sb. + */ + +#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ + (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ + ? 20U : 8U) + +/* Extended attribute operations touch at most two data buffers, + * two bitmap buffers, and two group summaries, in addition to the inode + * and the superblock, which are already accounted for. */ + +#define EXT4_XATTR_TRANS_BLOCKS 6U + +/* Define the minimum size for a transaction which modifies data. This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates. */ + +#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ + EXT4_XATTR_TRANS_BLOCKS - 2 + \ + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) + +/* + * Define the number of metadata blocks we need to account to modify data. + * + * This includes the super block, inode block, quota blocks and xattr blocks. + */ +#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) + +/* Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction.
For unbounded transactions such as + * write(2) and truncate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. */ + +#define EXT4_MAX_TRANS_DATA 64U + +/* We break up a large truncate or write transaction once the handle's + * buffer credits get this low; we then need either to extend the + * transaction or to start a new one. Reserve enough space here for + * inode, bitmap, superblock, group and indirection updates for at least + * one block, plus two quota updates. Quota allocations are not + * needed. */ + +#define EXT4_RESERVE_TRANS_BLOCKS 12U + +#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8 + +#ifdef CONFIG_QUOTA +/* Number of blocks needed for a quota update - we know that the structure was + * allocated so we need to update only the data block */ +#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ + EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\ + 1 : 0) +/* Number of blocks needed for a quota insert/delete - we do some block writes + * but inode, sb and group updates are done only once */ +#define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ + EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\ + (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ + +3+DQUOT_INIT_REWRITE) : 0) + +#define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ + EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\ + (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ + +3+DQUOT_DEL_REWRITE) : 0) +#else +#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0 +#define EXT4_QUOTA_INIT_BLOCKS(sb) 0 +#define EXT4_QUOTA_DEL_BLOCKS(sb) 0 +#endif +#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) + +static inline int ext4_jbd2_credits_xattr(struct inode *inode) +{ + int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); + + /* + * In case of inline data, we may push out the data to a block, + * so we need to reserve credits for this eventuality + */ + if (ext4_has_inline_data(inode)) + credits += ext4_writepage_trans_blocks(inode) + 1; + return credits; +} + + +/* + * Ext4 handle operation types -- for logging purposes + */ +#define EXT4_HT_MISC 0 +#define EXT4_HT_INODE 1 +#define EXT4_HT_WRITE_PAGE 2 +#define EXT4_HT_MAP_BLOCKS 3 +#define EXT4_HT_DIR 4 +#define EXT4_HT_TRUNCATE 5 +#define EXT4_HT_QUOTA 6 +#define EXT4_HT_RESIZE 7 +#define EXT4_HT_MIGRATE 8 +#define EXT4_HT_MOVE_EXTENTS 9 +#define EXT4_HT_XATTR 10 +#define EXT4_HT_EXT_CONVERT 11 +#define EXT4_HT_MAX 12 + +/** + * struct ext4_journal_cb_entry - Base structure for callback information. + * + * This struct is a 'seed' structure for use with your own callback + * structs. If you are using callbacks you must allocate one of these + * or another struct of your own definition which has this struct + * as its first element and pass it to ext4_journal_callback_add().
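+ * + * A minimal embedding sketch (all names here are illustrative only): + * + * struct my_commit_cb { + * struct ext4_journal_cb_entry mcc_jce; <- must be first + * ext4_fsblk_t mcc_block; + * }; + * + * The callback can then recover its private struct with + * container_of(jce, struct my_commit_cb, mcc_jce).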
+ */ +struct ext4_journal_cb_entry { + /* list information for other callbacks attached to the same handle */ + struct list_head jce_list; + + /* Function to call with this callback structure */ + void (*jce_func)(struct super_block *sb, + struct ext4_journal_cb_entry *jce, int error); + + /* user data goes here */ +}; + +/** + * ext4_journal_callback_add: add a function to call after transaction commit + * @handle: active journal transaction handle to register callback on + * @func: callback function to call after the transaction has committed; + * it is passed @sb: superblock of current filesystem for transaction, + * @jce: returned journal callback data, and @rc: journal state at commit + * (0 = transaction committed properly) + * @jce: journal callback data (internal and function private data struct) + * + * The registered function will be called in the context of the journal thread + * after the transaction for which the handle was created has completed. + * + * No locks are held when the callback function is called, so it is safe to + * call blocking functions from within the callback, but the callback should + * not block or run for too long, or the filesystem will be blocked waiting for + * the next transaction to commit. No journaling functions can be used, or + * there is a risk of deadlock. + * + * There is no guaranteed calling order of multiple registered callbacks on + * the same transaction. + */ +static inline void ext4_journal_callback_add(handle_t *handle, + void (*func)(struct super_block *sb, + struct ext4_journal_cb_entry *jce, + int rc), + struct ext4_journal_cb_entry *jce) +{ + struct ext4_sb_info *sbi = + EXT4_SB(handle->h_transaction->t_journal->j_private); + + /* Add the jce to transaction's private list */ + jce->jce_func = func; + spin_lock(&sbi->s_md_lock); + list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); + spin_unlock(&sbi->s_md_lock); +} + +/** + * ext4_journal_callback_try_del: delete a registered callback + * @handle: active journal transaction handle on which callback was registered + * @jce: registered journal callback entry to unregister + * Return true if the object was successfully removed + */ +static inline bool ext4_journal_callback_try_del(handle_t *handle, + struct ext4_journal_cb_entry *jce) +{ + bool deleted; + struct ext4_sb_info *sbi = + EXT4_SB(handle->h_transaction->t_journal->j_private); + + spin_lock(&sbi->s_md_lock); + deleted = !list_empty(&jce->jce_list); + list_del_init(&jce->jce_list); + spin_unlock(&sbi->s_md_lock); + return deleted; +} + +int +ext4_mark_iloc_dirty(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc); + +/* + * On success, we end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc); + +int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); + +/* + * Wrapper functions with which ext4 calls into JBD.
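+ * + * Each wrapper takes the caller's __func__ and __LINE__ so that a + * journal abort can be reported against the call site. For example + * (a sketch): + * + * err = ext4_journal_get_write_access(handle, bh); + * + * expands, via the macros below, to + * __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)).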
+ */ +int __ext4_journal_get_write_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh); + +int __ext4_forget(const char *where, unsigned int line, handle_t *handle, + int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr); + +int __ext4_journal_get_create_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh); + +int __ext4_handle_dirty_metadata(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct buffer_head *bh); + +int __ext4_handle_dirty_super(const char *where, unsigned int line, + handle_t *handle, struct super_block *sb); + +#define ext4_journal_get_write_access(handle, bh) \ + __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) +#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ + __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \ + (bh), (block_nr)) +#define ext4_journal_get_create_access(handle, bh) \ + __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh)) +#define ext4_handle_dirty_metadata(handle, inode, bh) \ + __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ + (bh)) +#define ext4_handle_dirty_super(handle, sb) \ + __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) + +handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, + int type, int blocks, int rsv_blocks); +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); + +#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) + +/* Note: Do not use this for NULL handles. This is only to determine if + * a properly allocated handle is using a journal or not. */ +static inline int ext4_handle_valid(handle_t *handle) +{ + if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT) + return 0; + return 1; +} + +static inline void ext4_handle_sync(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + handle->h_sync = 1; +} + +static inline int ext4_handle_is_aborted(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + return is_handle_aborted(handle); + return 0; +} + +static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) +{ + if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed) + return 0; + return 1; +} + +#define ext4_journal_start_sb(sb, type, nblocks) \ + __ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0) + +#define ext4_journal_start(inode, type, nblocks) \ + __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0) + +#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \ + __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks)) + +static inline handle_t *__ext4_journal_start(struct inode *inode, + unsigned int line, int type, + int blocks, int rsv_blocks) +{ + return __ext4_journal_start_sb(inode->i_sb, line, type, blocks, + rsv_blocks); +} + +#define ext4_journal_stop(handle) \ + __ext4_journal_stop(__func__, __LINE__, (handle)) + +#define ext4_journal_start_reserved(handle, type) \ + __ext4_journal_start_reserved((handle), __LINE__, (type)) + +handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, + int type); + +static inline void ext4_journal_free_reserved(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + jbd2_journal_free_reserved(handle); +} + +static inline handle_t *ext4_journal_current_handle(void) +{ + return journal_current_handle(); +} + +static inline int ext4_journal_extend(handle_t *handle, int nblocks) +{ + if 
(ext4_handle_valid(handle)) + return jbd2_journal_extend(handle, nblocks); + return 0; +} + +static inline int ext4_journal_restart(handle_t *handle, int nblocks) +{ + if (ext4_handle_valid(handle)) + return jbd2_journal_restart(handle, nblocks); + return 0; +} + +static inline int ext4_journal_blocks_per_page(struct inode *inode) +{ + if (EXT4_JOURNAL(inode) != NULL) + return jbd2_journal_blocks_per_page(inode); + return 0; +} + +static inline int ext4_journal_force_commit(journal_t *journal) +{ + if (journal) + return jbd2_journal_force_commit(journal); + return 0; +} + +static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) +{ + if (ext4_handle_valid(handle)) + return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode); + return 0; +} + +static inline void ext4_update_inode_fsync_trans(handle_t *handle, + struct inode *inode, + int datasync) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (ext4_handle_valid(handle)) { + ei->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + ei->i_datasync_tid = handle->h_transaction->t_tid; + } +} + +/* super.c */ +int ext4_force_commit(struct super_block *sb); + +/* + * Ext4 inode journal modes + */ +#define EXT4_INODE_JOURNAL_DATA_MODE 0x01 /* journal data mode */ +#define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ +#define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ + +static inline int ext4_inode_journal_mode(struct inode *inode) +{ + if (EXT4_JOURNAL(inode) == NULL) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + /* We do not support data journalling with delayed allocation */ + if (!S_ISREG(inode->i_mode) || + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC)) + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + else + BUG(); +} + +static inline int ext4_should_journal_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE; +} + +static inline int ext4_should_order_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE; +} + +static inline int ext4_should_writeback_data(struct inode *inode) +{ + return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE; +} + +/* + * This function controls whether or not we should try to go down the + * dioread_nolock code paths, which makes it safe to avoid taking + * i_mutex for direct I/O reads. This only works for extent-based + * files, and it doesn't work if data journaling is enabled, since the + * dioread_nolock code uses b_private to pass information back to the + * I/O completion handler, and this conflicts with the jbd's use of + * b_private. 
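+ * + * Hence the four checks below: the dioread_nolock mount option must be + * set, and the inode must be a regular, extent-mapped file that is not + * journaling its data; a data=journal mount, for instance, disables the + * optimization for every inode.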
+ */ +static inline int ext4_should_dioread_nolock(struct inode *inode) +{ + if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) + return 0; + if (!S_ISREG(inode->i_mode)) + return 0; + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return 0; + if (ext4_should_journal_data(inode)) + return 0; + return 1; +} + +#endif /* _EXT4_JBD2_H */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9ae6e67090c..4da228a0e6d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -29,80 +29,108 @@ * - smart tree reduction */ -#include <linux/module.h> #include <linux/fs.h> #include <linux/time.h> -#include <linux/ext4_jbd2.h> #include <linux/jbd2.h> #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/quotaops.h> #include <linux/string.h> #include <linux/slab.h> -#include <linux/falloc.h> -#include <linux/ext4_fs_extents.h> #include <asm/uaccess.h> +#include <linux/fiemap.h> +#include "ext4_jbd2.h" +#include "ext4_extents.h" +#include "xattr.h" +#include <trace/events/ext4.h> /* - * ext_pblock: - * combine low and high parts of physical block number into ext4_fsblk_t + * used by extent splitting. */ -static ext4_fsblk_t ext_pblock(struct ext4_extent *ex) -{ - ext4_fsblk_t block; +#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \ + due to ENOSPC */ +#define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */ +#define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */ - block = le32_to_cpu(ex->ee_start_lo); - block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1; - return block; -} +#define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */ +#define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */ -/* - * idx_pblock: - * combine low and high parts of a leaf physical block number into ext4_fsblk_t - */ -ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix) +static __le32 ext4_extent_block_csum(struct inode *inode, + struct ext4_extent_header *eh) { - ext4_fsblk_t block; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 csum; - block = le32_to_cpu(ix->ei_leaf_lo); - block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1; - return block; + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh, + EXT4_EXTENT_TAIL_OFFSET(eh)); + return cpu_to_le32(csum); } -/* - * ext4_ext_store_pblock: - * stores a large physical block number into an extent struct, - * breaking it into parts - */ -void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb) +static int ext4_extent_block_csum_verify(struct inode *inode, + struct ext4_extent_header *eh) { - ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); - ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); + struct ext4_extent_tail *et; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + et = find_ext4_extent_tail(eh); + if (et->et_checksum != ext4_extent_block_csum(inode, eh)) + return 0; + return 1; } -/* - * ext4_idx_store_pblock: - * stores a large physical block number into an index struct, - * breaking it into parts - */ -static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) +static void ext4_extent_block_csum_set(struct inode *inode, + struct ext4_extent_header *eh) { - ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); - ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); + struct ext4_extent_tail *et; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + 
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + et = find_ext4_extent_tail(eh); + et->et_checksum = ext4_extent_block_csum(inode, eh); } -static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed) +static int ext4_split_extent(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + struct ext4_map_blocks *map, + int split_flag, + int flags); + +static int ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t split, + int split_flag, + int flags); + +static int ext4_find_delayed_extent(struct inode *inode, + struct extent_status *newes); + +static int ext4_ext_truncate_extend_restart(handle_t *handle, + struct inode *inode, + int needed) { int err; + if (!ext4_handle_valid(handle)) + return 0; if (handle->h_buffer_credits > needed) - return handle; - if (!ext4_journal_extend(handle, needed)) - return handle; - err = ext4_journal_restart(handle, needed); + return 0; + err = ext4_journal_extend(handle, needed); + if (err <= 0) + return err; + err = ext4_truncate_restart_trans(handle, inode, needed); + if (err == 0) + err = -EAGAIN; - return handle; + return err; } /* @@ -115,6 +143,7 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, { if (path->p_bh) { /* path points to block */ + BUFFER_TRACE(path->p_bh, "get_write_access"); return ext4_journal_get_write_access(handle, path->p_bh); } /* path points to leaf/index in inode body */ @@ -128,13 +157,15 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, * - ENOMEM * - EIO */ -static int ext4_ext_dirty(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) +int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, + struct inode *inode, struct ext4_ext_path *path) { int err; if (path->p_bh) { + ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh)); /* path points to block */ - err = ext4_journal_dirty_metadata(handle, path->p_bh); + err = __ext4_handle_dirty_metadata(where, line, handle, + inode, path->p_bh); } else { /* path points to leaf/index in inode body */ err = ext4_mark_inode_dirty(handle, inode); @@ -146,20 +177,37 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { - struct ext4_inode_info *ei = EXT4_I(inode); - ext4_fsblk_t bg_start; - ext4_fsblk_t last_block; - ext4_grpblk_t colour; - int depth; - if (path) { + int depth = path->p_depth; struct ext4_extent *ex; - depth = path->p_depth; - /* try to predict block placement */ + /* + * Try to predict block placement assuming that we are + * filling in a file which will eventually be + * non-sparse --- i.e., in the case of libbfd writing + * out ELF object sections out-of-order but in a way + * that eventually results in a contiguous object or + * executable file, or some database extending a table + * space file. However, this is actually somewhat + * non-ideal if we are writing a sparse file such as + * qemu or KVM writing a raw image file that is going + * to stay fairly sparse, since it will end up + * fragmenting the file system's free space. Maybe we + * should have some heuristics or some way to allow + * userspace to pass a hint to the file system, + * especially if the latter case turns out to be + * common.
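+ * + * As a worked example (with hypothetical numbers): if the nearest + * extent maps logical block 100 to physical block 5000 and we want + * logical block 104, the goal computed below is 5000 + (104 - 100) = + * 5004.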
+ */ ex = path[depth].p_ext; - if (ex) - return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block)); + if (ex) { + ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex); + ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block); + + if (block > ext_block) + return ext_pblk + (block - ext_block); + else + return ext_pblk - (ext_block - block); + } /* it looks like index is empty; * try to find starting block from index itself */ @@ -168,57 +216,52 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, } /* OK. use inode's group */ - bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) + - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block); - last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; - - if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) - colour = (current->pid % 16) * - (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); - else - colour = (current->pid % 16) * ((last_block - bg_start) / 16); - return bg_start + colour + block; + return ext4_inode_to_goal_block(inode); } +/* + * Allocation for a metadata block + */ static ext4_fsblk_t -ext4_ext_new_block(handle_t *handle, struct inode *inode, +ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *ex, int *err) + struct ext4_extent *ex, int *err, unsigned int flags) { ext4_fsblk_t goal, newblock; goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); - newblock = ext4_new_block(handle, inode, goal, err); + newblock = ext4_new_meta_blocks(handle, inode, goal, flags, + NULL, err); return newblock; } -static int ext4_ext_space_block(struct inode *inode) +static inline int ext4_ext_space_block(struct inode *inode, int check) { int size; size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent); #ifdef AGGRESSIVE_TEST - if (size > 6) + if (!check && size > 6) size = 6; #endif return size; } -static int ext4_ext_space_block_idx(struct inode *inode) +static inline int ext4_ext_space_block_idx(struct inode *inode, int check) { int size; size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent_idx); #ifdef AGGRESSIVE_TEST - if (size > 5) + if (!check && size > 5) size = 5; #endif return size; } -static int ext4_ext_space_root(struct inode *inode) +static inline int ext4_ext_space_root(struct inode *inode, int check) { int size; @@ -226,13 +269,13 @@ static int ext4_ext_space_root(struct inode *inode) size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent); #ifdef AGGRESSIVE_TEST - if (size > 3) + if (!check && size > 3) size = 3; #endif return size; } -static int ext4_ext_space_root_idx(struct inode *inode) +static inline int ext4_ext_space_root_idx(struct inode *inode, int check) { int size; @@ -240,12 +283,59 @@ static int ext4_ext_space_root_idx(struct inode *inode) size -= sizeof(struct ext4_extent_header); size /= sizeof(struct ext4_extent_idx); #ifdef AGGRESSIVE_TEST - if (size > 4) + if (!check && size > 4) size = 4; #endif return size; } +/* + * Calculate the number of metadata blocks needed + * to allocate @blocks + * Worst case is one block per extent + */ +int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + int idxs; + + idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent_idx)); + + /* + * If the new delayed allocation block is contiguous with the + * previous da block, it can share index blocks with the * previous block, so we only need to allocate a new index + * block every idxs leaf blocks. At idxs**2 blocks, we need + * an additional index block, and at idxs**3 blocks, yet + * another index block. + */ + if (ei->i_da_metadata_calc_len && + ei->i_da_metadata_calc_last_lblock+1 == lblock) { + int num = 0; + + if ((ei->i_da_metadata_calc_len % idxs) == 0) + num++; + if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) + num++; + if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) { + num++; + ei->i_da_metadata_calc_len = 0; + } else + ei->i_da_metadata_calc_len++; + ei->i_da_metadata_calc_last_lblock++; + return num; + } + + /* + * In the worst case we need a new set of index blocks at + * every level of the inode's extent tree. + */ + ei->i_da_metadata_calc_len = 1; + ei->i_da_metadata_calc_last_lblock = lblock; + return ext_depth(inode) + 1; +} + static int ext4_ext_max_entries(struct inode *inode, int depth) { @@ -253,22 +343,88 @@ ext4_ext_max_entries(struct inode *inode, int depth) if (depth == ext_depth(inode)) { if (depth == 0) - max = ext4_ext_space_root(inode); + max = ext4_ext_space_root(inode, 1); else - max = ext4_ext_space_root_idx(inode); + max = ext4_ext_space_root_idx(inode, 1); } else { if (depth == 0) - max = ext4_ext_space_block(inode); + max = ext4_ext_space_block(inode, 1); else - max = ext4_ext_space_block_idx(inode); + max = ext4_ext_space_block_idx(inode, 1); } return max; } -static int __ext4_ext_check_header(const char *function, struct inode *inode, - struct ext4_extent_header *eh, - int depth) +static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) +{ + ext4_fsblk_t block = ext4_ext_pblock(ext); + int len = ext4_ext_get_actual_len(ext); + ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); + ext4_lblk_t last = lblock + len - 1; + + if (lblock > last) + return 0; + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); +} + +static int ext4_valid_extent_idx(struct inode *inode, + struct ext4_extent_idx *ext_idx) +{ + ext4_fsblk_t block = ext4_idx_pblock(ext_idx); + + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); +} + +static int ext4_valid_extent_entries(struct inode *inode, + struct ext4_extent_header *eh, + int depth) +{ + unsigned short entries; + if (eh->eh_entries == 0) + return 1; + + entries = le16_to_cpu(eh->eh_entries); + + if (depth == 0) { + /* leaf entries */ + struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + ext4_fsblk_t pblock = 0; + ext4_lblk_t lblock = 0; + ext4_lblk_t prev = 0; + int len = 0; + while (entries) { + if (!ext4_valid_extent(inode, ext)) + return 0; + + /* Check for overlapping extents */ + lblock = le32_to_cpu(ext->ee_block); + len = ext4_ext_get_actual_len(ext); + if ((lblock <= prev) && prev) { + pblock = ext4_ext_pblock(ext); + es->s_last_error_block = cpu_to_le64(pblock); + return 0; + } + ext++; + entries--; + prev = lblock + len - 1; + } + } else { + struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); + while (entries) { + if (!ext4_valid_extent_idx(inode, ext_idx)) + return 0; + ext_idx++; + entries--; + } + } + return 1; +} + +static int __ext4_ext_check(const char *function, unsigned int line, + struct inode *inode, struct ext4_extent_header *eh, + int depth, ext4_fsblk_t pblk) { const char *error_msg; int max = 0; @@ -294,21 +450,163 @@ static int __ext4_ext_check_header(const char *function, struct inode *inode, error_msg = "invalid eh_entries"; goto corrupted; } + if (!ext4_valid_extent_entries(inode, eh,
depth)) { + error_msg = "invalid extent entries"; + goto corrupted; + } + /* Verify checksum on non-root extent tree nodes */ + if (ext_depth(inode) != depth && + !ext4_extent_block_csum_verify(inode, eh)) { + error_msg = "extent tree corrupted"; + goto corrupted; + } return 0; corrupted: - ext4_error(inode->i_sb, function, - "bad header in inode #%lu: %s - magic %x, " - "entries %u, max %u(%u), depth %u(%u)", - inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), - le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), - max, le16_to_cpu(eh->eh_depth), depth); - + ext4_error_inode(inode, function, line, 0, + "pblk %llu bad header/extent: %s - magic %x, " + "entries %u, max %u(%u), depth %u(%u)", + (unsigned long long) pblk, error_msg, + le16_to_cpu(eh->eh_magic), + le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), + max, le16_to_cpu(eh->eh_depth), depth); return -EIO; } -#define ext4_ext_check_header(inode, eh, depth) \ - __ext4_ext_check_header(__FUNCTION__, inode, eh, depth) +#define ext4_ext_check(inode, eh, depth, pblk) \ + __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk)) + +int ext4_ext_check_inode(struct inode *inode) +{ + return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); +} + +static struct buffer_head * +__read_extent_tree_block(const char *function, unsigned int line, + struct inode *inode, ext4_fsblk_t pblk, int depth, + int flags) +{ + struct buffer_head *bh; + int err; + + bh = sb_getblk(inode->i_sb, pblk); + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); + + if (!bh_uptodate_or_lock(bh)) { + trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); + err = bh_submit_read(bh); + if (err < 0) + goto errout; + } + if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) + return bh; + err = __ext4_ext_check(function, line, inode, + ext_block_hdr(bh), depth, pblk); + if (err) + goto errout; + set_buffer_verified(bh); + /* + * If this is a leaf block, cache all of its entries + */ + if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { + struct ext4_extent_header *eh = ext_block_hdr(bh); + struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); + ext4_lblk_t prev = 0; + int i; + + for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { + unsigned int status = EXTENT_STATUS_WRITTEN; + ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); + int len = ext4_ext_get_actual_len(ex); + + if (prev && (prev != lblk)) + ext4_es_cache_extent(inode, prev, + lblk - prev, ~0, + EXTENT_STATUS_HOLE); + + if (ext4_ext_is_unwritten(ex)) + status = EXTENT_STATUS_UNWRITTEN; + ext4_es_cache_extent(inode, lblk, len, + ext4_ext_pblock(ex), status); + prev = lblk + len; + } + } + return bh; +errout: + put_bh(bh); + return ERR_PTR(err); + +} + +#define read_extent_tree_block(inode, pblk, depth, flags) \ + __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \ + (depth), (flags)) + +/* + * This function is called to cache a file's extent information in the + * extent status tree + */ +int ext4_ext_precache(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_ext_path *path = NULL; + struct buffer_head *bh; + int i = 0, depth, ret = 0; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return 0; /* not an extent-mapped inode */ + + down_read(&ei->i_data_sem); + depth = ext_depth(inode); + + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), + GFP_NOFS); + if (path == NULL) { + up_read(&ei->i_data_sem); + return -ENOMEM; + } + + /* Don't cache anything if there are no external extent blocks */ + if (depth == 0) + goto out; + 
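/* + * Iterative depth-first walk of the extent tree: path[] serves as an + * explicit stack and i is the current level. Each pass of the loop + * below either descends through one more index block or, once a leaf + * or an exhausted index block is reached, pops back up with i--. + */ +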
path[0].p_hdr = ext_inode_hdr(inode); + ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); + if (ret) + goto out; + path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr); + while (i >= 0) { + /* + * If this is a leaf block or we've reached the end of + * the index block, go up + */ + if ((i == depth) || + path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) { + brelse(path[i].p_bh); + path[i].p_bh = NULL; + i--; + continue; + } + bh = read_extent_tree_block(inode, + ext4_idx_pblock(path[i].p_idx++), + depth - i - 1, + EXT4_EX_FORCE_CACHE); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); + break; + } + i++; + path[i].p_bh = bh; + path[i].p_hdr = ext_block_hdr(bh); + path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr); + } + ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED); +out: + up_read(&ei->i_data_sem); + ext4_ext_drop_refs(path); + kfree(path); + return ret; +} #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) @@ -319,12 +617,13 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) for (k = 0; k <= l; k++, path++) { if (path->p_idx) { ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), - idx_pblock(path->p_idx)); + ext4_idx_pblock(path->p_idx)); } else if (path->p_ext) { - ext_debug(" %d:%d:%llu ", + ext_debug(" %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), + ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext), - ext_pblock(path->p_ext)); + ext4_ext_pblock(path->p_ext)); } else ext_debug(" []"); } @@ -344,15 +643,52 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) eh = path[depth].p_hdr; ex = EXT_FIRST_EXTENT(eh); + ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino); + for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { - ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block), - ext4_ext_get_actual_len(ex), ext_pblock(ex)); + ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), + ext4_ext_is_unwritten(ex), + ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); } ext_debug("\n"); } + +static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, + ext4_fsblk_t newblock, int level) +{ + int depth = ext_depth(inode); + struct ext4_extent *ex; + + if (depth != level) { + struct ext4_extent_idx *idx; + idx = path[level].p_idx; + while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { + ext_debug("%d: move %d:%llu in new index %llu\n", level, + le32_to_cpu(idx->ei_block), + ext4_idx_pblock(idx), + newblock); + idx++; + } + + return; + } + + ex = path[depth].p_ext; + while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { + ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", + le32_to_cpu(ex->ee_block), + ext4_ext_pblock(ex), + ext4_ext_is_unwritten(ex), + ext4_ext_get_actual_len(ex), + newblock); + ex++; + } +} + #else -#define ext4_ext_show_path(inode,path) -#define ext4_ext_show_leaf(inode,path) +#define ext4_ext_show_path(inode, path) +#define ext4_ext_show_leaf(inode, path) +#define ext4_ext_show_move(inode, path, newblock, level) #endif void ext4_ext_drop_refs(struct ext4_ext_path *path) @@ -396,8 +732,8 @@ ext4_ext_binsearch_idx(struct inode *inode, } path->p_idx = l - 1; - ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block), - idx_pblock(path->p_idx)); + ext_debug(" -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block), + ext4_idx_pblock(path->p_idx)); #ifdef CHECK_BINSEARCH { @@ -408,9 +744,10 @@ ext4_ext_binsearch_idx(struct inode *inode, for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { if (k != 0 && 
le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { - printk("k=%d, ix=0x%p, first=0x%p\n", k, - ix, EXT_FIRST_INDEX(eh)); - printk("%u <= %u\n", + printk(KERN_DEBUG "k=%d, ix=0x%p, " + "first=0x%p\n", k, + ix, EXT_FIRST_INDEX(eh)); + printk(KERN_DEBUG "%u <= %u\n", le32_to_cpu(ix->ei_block), le32_to_cpu(ix[-1].ei_block)); } @@ -463,9 +800,10 @@ ext4_ext_binsearch(struct inode *inode, } path->p_ext = l - 1; - ext_debug(" -> %d:%llu:%d ", + ext_debug(" -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), - ext_pblock(path->p_ext), + ext4_ext_pblock(path->p_ext), + ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext)); #ifdef CHECK_BINSEARCH @@ -495,25 +833,22 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) eh->eh_depth = 0; eh->eh_entries = 0; eh->eh_magic = EXT4_EXT_MAGIC; - eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode)); + eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); ext4_mark_inode_dirty(handle, inode); - ext4_ext_invalidate_cache(inode); return 0; } struct ext4_ext_path * ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_path *path) + struct ext4_ext_path *path, int flags) { struct ext4_extent_header *eh; struct buffer_head *bh; short int depth, i, ppos = 0, alloc = 0; + int ret; eh = ext_inode_hdr(inode); depth = ext_depth(inode); - if (ext4_ext_check_header(inode, eh, depth)) - return ERR_PTR(-EIO); - /* account possible depth increase */ if (!path) { @@ -524,6 +859,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, alloc = 1; } path[0].p_hdr = eh; + path[0].p_bh = NULL; i = depth; /* walk through the tree */ @@ -532,32 +868,39 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); ext4_ext_binsearch_idx(inode, path + ppos, block); - path[ppos].p_block = idx_pblock(path[ppos].p_idx); + path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); path[ppos].p_depth = i; path[ppos].p_ext = NULL; - bh = sb_bread(inode->i_sb, path[ppos].p_block); - if (!bh) + bh = read_extent_tree_block(inode, path[ppos].p_block, --i, + flags); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); goto err; + } eh = ext_block_hdr(bh); ppos++; - BUG_ON(ppos > depth); + if (unlikely(ppos > depth)) { + put_bh(bh); + EXT4_ERROR_INODE(inode, + "ppos %d > depth %d", ppos, depth); + ret = -EIO; + goto err; + } path[ppos].p_bh = bh; path[ppos].p_hdr = eh; - i--; - - if (ext4_ext_check_header(inode, eh, i)) - goto err; } path[ppos].p_depth = i; - path[ppos].p_hdr = eh; path[ppos].p_ext = NULL; path[ppos].p_idx = NULL; /* find extent */ ext4_ext_binsearch(inode, path + ppos, block); + /* if not an empty leaf */ + if (path[ppos].p_ext) + path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); ext4_ext_show_path(inode, path); @@ -567,7 +910,7 @@ err: ext4_ext_drop_refs(path); if (alloc) kfree(path); - return ERR_PTR(-EIO); + return ERR_PTR(ret); } /* @@ -576,8 +919,8 @@ err: * check where to insert: before @curp or after @curp */ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, - struct ext4_ext_path *curp, - int logical, ext4_fsblk_t ptr) + struct ext4_ext_path *curp, + int logical, ext4_fsblk_t ptr) { struct ext4_extent_idx *ix; int len, err; @@ -586,39 +929,54 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, if (err) return err; - BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block)); - len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; + if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) { + 
EXT4_ERROR_INODE(inode, + "logical %d == ei_block %d!", + logical, le32_to_cpu(curp->p_idx->ei_block)); + return -EIO; + } + + if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) + >= le16_to_cpu(curp->p_hdr->eh_max))) { + EXT4_ERROR_INODE(inode, + "eh_entries %d >= eh_max %d!", + le16_to_cpu(curp->p_hdr->eh_entries), + le16_to_cpu(curp->p_hdr->eh_max)); + return -EIO; + } + if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ - if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { - len = (len - 1) * sizeof(struct ext4_extent_idx); - len = len < 0 ? 0 : len; - ext_debug("insert new index %d after: %llu. " - "move %d from 0x%p to 0x%p\n", - logical, ptr, len, - (curp->p_idx + 1), (curp->p_idx + 2)); - memmove(curp->p_idx + 2, curp->p_idx + 1, len); - } + ext_debug("insert new index %d after: %llu\n", logical, ptr); ix = curp->p_idx + 1; } else { /* insert before */ - len = len * sizeof(struct ext4_extent_idx); - len = len < 0 ? 0 : len; - ext_debug("insert new index %d before: %llu. " - "move %d from 0x%p to 0x%p\n", - logical, ptr, len, - curp->p_idx, (curp->p_idx + 1)); - memmove(curp->p_idx + 1, curp->p_idx, len); + ext_debug("insert new index %d before: %llu\n", logical, ptr); ix = curp->p_idx; } + len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; + BUG_ON(len < 0); + if (len > 0) { + ext_debug("insert new index %d: " + "move %d indices from 0x%p to 0x%p\n", + logical, len, ix, ix + 1); + memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); + } + + if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { + EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); + return -EIO; + } + ix->ei_block = cpu_to_le32(logical); ext4_idx_store_pblock(ix, ptr); - curp->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(curp->p_hdr->eh_entries)+1); + le16_add_cpu(&curp->p_hdr->eh_entries, 1); - BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries) - > le16_to_cpu(curp->p_hdr->eh_max)); - BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr)); + if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { + EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); + return -EIO; + } err = ext4_ext_dirty(handle, inode, curp); ext4_std_error(inode->i_sb, err); @@ -637,14 +995,14 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, * - initializes subtree */ static int ext4_ext_split(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *newext, int at) + unsigned int flags, + struct ext4_ext_path *path, + struct ext4_extent *newext, int at) { struct buffer_head *bh = NULL; int depth = ext_depth(inode); struct ext4_extent_header *neh; struct ext4_extent_idx *fidx; - struct ext4_extent *ex; int i = at, k, m, a; ext4_fsblk_t newblock, oldblock; __le32 border; @@ -656,7 +1014,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, /* if current leaf will be split, then we should use * border from split point */ - BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr)); + if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); + return -EIO; + } if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { border = path[depth].p_ext[1].ee_block; ext_debug("leaf will be split." 
@@ -688,7 +1049,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, /* allocate all needed blocks */ ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); for (a = 0; a < depth - at; a++) { - newblock = ext4_ext_new_block(handle, inode, path, newext, &err); + newblock = ext4_ext_new_meta_block(handle, inode, path, + newext, &err, flags); if (newblock == 0) goto cleanup; ablocks[a] = newblock; @@ -696,12 +1058,16 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, /* initialize new leaf */ newblock = ablocks[--a]; - BUG_ON(newblock == 0); - bh = sb_getblk(inode->i_sb, newblock); - if (!bh) { + if (unlikely(newblock == 0)) { + EXT4_ERROR_INODE(inode, "newblock == 0!"); err = -EIO; goto cleanup; } + bh = sb_getblk(inode->i_sb, newblock); + if (unlikely(!bh)) { + err = -ENOMEM; + goto cleanup; + } lock_buffer(bh); err = ext4_journal_get_create_access(handle, bh); @@ -710,39 +1076,34 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, neh = ext_block_hdr(bh); neh->eh_entries = 0; - neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); + neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_depth = 0; - ex = EXT_FIRST_EXTENT(neh); /* move remainder of path[depth] to the new leaf */ - BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max); - /* start copy from next extent */ - /* TODO: we could do it by single memmove */ - m = 0; - path[depth].p_ext++; - while (path[depth].p_ext <= - EXT_MAX_EXTENT(path[depth].p_hdr)) { - ext_debug("move %d:%llu:%d in new leaf %llu\n", - le32_to_cpu(path[depth].p_ext->ee_block), - ext_pblock(path[depth].p_ext), - ext4_ext_get_actual_len(path[depth].p_ext), - newblock); - /*memmove(ex++, path[depth].p_ext++, - sizeof(struct ext4_extent)); - neh->eh_entries++;*/ - path[depth].p_ext++; - m++; + if (unlikely(path[depth].p_hdr->eh_entries != + path[depth].p_hdr->eh_max)) { + EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", + path[depth].p_hdr->eh_entries, + path[depth].p_hdr->eh_max); + err = -EIO; + goto cleanup; } + /* start copy from next extent */ + m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++; + ext4_ext_show_move(inode, path, newblock, depth); if (m) { - memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m); - neh->eh_entries = cpu_to_le16(le16_to_cpu(neh->eh_entries)+m); + struct ext4_extent *ex; + ex = EXT_FIRST_EXTENT(neh); + memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m); + le16_add_cpu(&neh->eh_entries, m); } + ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); @@ -753,8 +1114,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto cleanup; - path[depth].p_hdr->eh_entries = - cpu_to_le16(le16_to_cpu(path[depth].p_hdr->eh_entries)-m); + le16_add_cpu(&path[depth].p_hdr->eh_entries, -m); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto cleanup; @@ -763,7 +1123,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, /* create intermediate indexes */ k = depth - at - 1; - BUG_ON(k < 0); + if (unlikely(k < 0)) { + EXT4_ERROR_INODE(inode, "k %d < 0!", k); + err = -EIO; + goto cleanup; + } if (k) ext_debug("create %d intermediate indices\n", k); /* insert new index into current index block */ @@ -773,8 +1137,8 @@ static 
int ext4_ext_split(handle_t *handle, struct inode *inode, oldblock = newblock; newblock = ablocks[--a]; bh = sb_getblk(inode->i_sb, newblock); - if (!bh) { - err = -EIO; + if (unlikely(!bh)) { + err = -ENOMEM; goto cleanup; } lock_buffer(bh); @@ -786,7 +1150,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, neh = ext_block_hdr(bh); neh->eh_entries = cpu_to_le16(1); neh->eh_magic = EXT4_EXT_MAGIC; - neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); + neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); neh->eh_depth = cpu_to_le16(depth - i); fidx = EXT_FIRST_INDEX(neh); fidx->ei_block = border; @@ -794,36 +1158,31 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, ext_debug("int.index at %d (block %llu): %u -> %llu\n", i, newblock, le32_to_cpu(border), oldblock); - /* copy indexes */ - m = 0; - path[i].p_idx++; + /* move remainder of path[i] to the new index block */ + if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != + EXT_LAST_INDEX(path[i].p_hdr))) { + EXT4_ERROR_INODE(inode, + "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", + le32_to_cpu(path[i].p_ext->ee_block)); + err = -EIO; + goto cleanup; + } + /* start copy indexes */ + m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, EXT_MAX_INDEX(path[i].p_hdr)); - BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) != - EXT_LAST_INDEX(path[i].p_hdr)); - while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { - ext_debug("%d: move %d:%llu in new index %llu\n", i, - le32_to_cpu(path[i].p_idx->ei_block), - idx_pblock(path[i].p_idx), - newblock); - /*memmove(++fidx, path[i].p_idx++, - sizeof(struct ext4_extent_idx)); - neh->eh_entries++; - BUG_ON(neh->eh_entries > neh->eh_max);*/ - path[i].p_idx++; - m++; - } + ext4_ext_show_move(inode, path, newblock, i); if (m) { - memmove(++fidx, path[i].p_idx - m, + memmove(++fidx, path[i].p_idx, sizeof(struct ext4_extent_idx) * m); - neh->eh_entries = - cpu_to_le16(le16_to_cpu(neh->eh_entries) + m); + le16_add_cpu(&neh->eh_entries, m); } + ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); @@ -834,7 +1193,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, err = ext4_ext_get_access(handle, inode, path + i); if (err) goto cleanup; - path[i].p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path[i].p_hdr->eh_entries)-m); + le16_add_cpu(&path[i].p_hdr->eh_entries, -m); err = ext4_ext_dirty(handle, inode, path + i); if (err) goto cleanup; @@ -859,7 +1218,8 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; - ext4_free_blocks(handle, inode, ablocks[i], 1, 1); + ext4_free_blocks(handle, inode, NULL, ablocks[i], 1, + EXT4_FREE_BLOCKS_METADATA); } } kfree(ablocks); @@ -876,26 +1236,22 @@ cleanup: * just created block */ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *newext) + unsigned int flags, + struct ext4_extent *newext) { - struct ext4_ext_path *curp = path; struct ext4_extent_header *neh; - struct ext4_extent_idx *fidx; struct buffer_head *bh; ext4_fsblk_t newblock; int err = 0; - newblock = ext4_ext_new_block(handle, inode, path, newext, &err); + newblock = ext4_ext_new_meta_block(handle, inode, NULL, + newext, &err, flags); if (newblock == 0) return err; bh = sb_getblk(inode->i_sb, newblock); - if (!bh) { - err = -EIO; - ext4_std_error(inode->i_sb, 
err); - return err; - } + if (unlikely(!bh)) + return -ENOMEM; lock_buffer(bh); err = ext4_journal_get_create_access(handle, bh); @@ -905,50 +1261,43 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, } /* move top-level index/leaf into new block */ - memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data)); + memmove(bh->b_data, EXT4_I(inode)->i_data, + sizeof(EXT4_I(inode)->i_data)); /* set size of new block */ neh = ext_block_hdr(bh); /* old root could have indexes or leaves * so calculate e_max right way */ if (ext_depth(inode)) - neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); + neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); else - neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); + neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; + ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto out; - /* create index in new top-level index: num,max,pointer */ - err = ext4_ext_get_access(handle, inode, curp); - if (err) - goto out; - - curp->p_hdr->eh_magic = EXT4_EXT_MAGIC; - curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode)); - curp->p_hdr->eh_entries = cpu_to_le16(1); - curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); - - if (path[0].p_hdr->eh_depth) - curp->p_idx->ei_block = - EXT_FIRST_INDEX(path[0].p_hdr)->ei_block; - else - curp->p_idx->ei_block = - EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block; - ext4_idx_store_pblock(curp->p_idx, newblock); - + /* Update top-level index: num,max,pointer */ neh = ext_inode_hdr(inode); - fidx = EXT_FIRST_INDEX(neh); + neh->eh_entries = cpu_to_le16(1); + ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock); + if (neh->eh_depth == 0) { + /* Root extent block becomes index block */ + neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); + EXT_FIRST_INDEX(neh)->ei_block = + EXT_FIRST_EXTENT(neh)->ee_block; + } ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), - le32_to_cpu(fidx->ei_block), idx_pblock(fidx)); + le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), + ext4_idx_pblock(EXT_FIRST_INDEX(neh))); - neh->eh_depth = cpu_to_le16(path->p_depth + 1); - err = ext4_ext_dirty(handle, inode, curp); + le16_add_cpu(&neh->eh_depth, 1); + ext4_mark_inode_dirty(handle, inode); out: brelse(bh); @@ -961,8 +1310,10 @@ out: * if no free index is found, then it requests in-depth growing. 
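The depth-growing step above is easier to see in miniature: the old root that lives inside the inode is copied verbatim into a freshly allocated block, and the in-inode root is rewritten as a one-entry index pointing at that block, so the tree grows at the top rather than at the leaves. A hedged userspace sketch of the idea, using simplified stand-in types (not the real on-disk structures) and a made-up physical block number:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified stand-in for an extent tree node: header plus entries. */
struct node {
    uint16_t entries, max, depth;
    uint64_t child[4];      /* index entries: physical blocks of children */
};

/* Grow the tree one level: old root -> new block, root becomes an index. */
static void grow_indepth(struct node *root, struct node *newblock,
                         uint64_t newblock_pblk)
{
    memcpy(newblock, root, sizeof(*root));  /* move old root down */
    root->entries = 1;
    root->child[0] = newblock_pblk;         /* single index entry */
    root->depth++;                          /* tree is one level deeper */
}

int main(void)
{
    struct node root = { .entries = 4, .max = 4, .depth = 0 };
    struct node child;

    grow_indepth(&root, &child, 1234);      /* 1234 is a made-up pblock */
    printf("root: depth %u entries %u; child: entries %u\n",
           (unsigned)root.depth, (unsigned)root.entries,
           (unsigned)child.entries);
    return 0;
}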
*/ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *newext) + unsigned int mb_flags, + unsigned int gb_flags, + struct ext4_ext_path *path, + struct ext4_extent *newext) { struct ext4_ext_path *curp; int depth, i, err = 0; @@ -982,18 +1333,20 @@ repeat: if (EXT_HAS_FREE_INDEX(curp)) { /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ - err = ext4_ext_split(handle, inode, path, newext, i); + err = ext4_ext_split(handle, inode, mb_flags, path, newext, i); + if (err) + goto out; /* refill path */ ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path); + path, gb_flags); if (IS_ERR(path)) err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, path, newext); + err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext); if (err) goto out; @@ -1001,7 +1354,7 @@ repeat: ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path); + path, gb_flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -1029,15 +1382,18 @@ out: * returns 0 at @phys * return value contains 0 (success) or error code */ -int -ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, - ext4_lblk_t *logical, ext4_fsblk_t *phys) +static int ext4_ext_search_left(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *logical, ext4_fsblk_t *phys) { struct ext4_extent_idx *ix; struct ext4_extent *ex; int depth, ee_len; - BUG_ON(path == NULL); + if (unlikely(path == NULL)) { + EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); + return -EIO; + } depth = path->p_depth; *phys = 0; @@ -1051,40 +1407,63 @@ ext4_ext_search_left(struct inode *inode, struct ext4_ext_path *path, ex = path[depth].p_ext; ee_len = ext4_ext_get_actual_len(ex); if (*logical < le32_to_cpu(ex->ee_block)) { - BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); + if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { + EXT4_ERROR_INODE(inode, + "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", + *logical, le32_to_cpu(ex->ee_block)); + return -EIO; + } while (--depth >= 0) { ix = path[depth].p_idx; - BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); + if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, + "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!", + ix != NULL ? le32_to_cpu(ix->ei_block) : 0, + EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? 
+ le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0, + depth); + return -EIO; + } } return 0; } - BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); + if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { + EXT4_ERROR_INODE(inode, + "logical %d < ee_block %d + ee_len %d!", + *logical, le32_to_cpu(ex->ee_block), ee_len); + return -EIO; + } *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; - *phys = ext_pblock(ex) + ee_len - 1; + *phys = ext4_ext_pblock(ex) + ee_len - 1; return 0; } /* * search the closest allocated block to the right for *logical * and returns it at @logical + it's physical address at @phys - * if *logical is the smallest allocated block, the function + * if *logical is the largest allocated block, the function * returns 0 at @phys * return value contains 0 (success) or error code */ -int -ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, - ext4_lblk_t *logical, ext4_fsblk_t *phys) +static int ext4_ext_search_right(struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t *logical, ext4_fsblk_t *phys, + struct ext4_extent **ret_ex) { struct buffer_head *bh = NULL; struct ext4_extent_header *eh; struct ext4_extent_idx *ix; struct ext4_extent *ex; ext4_fsblk_t block; - int depth, ee_len; + int depth; /* Note, NOT eh_depth; depth from top of tree */ + int ee_len; - BUG_ON(path == NULL); + if (unlikely(path == NULL)) { + EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); + return -EIO; + } depth = path->p_depth; *phys = 0; @@ -1098,77 +1477,82 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, ex = path[depth].p_ext; ee_len = ext4_ext_get_actual_len(ex); if (*logical < le32_to_cpu(ex->ee_block)) { - BUG_ON(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex); + if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) { + EXT4_ERROR_INODE(inode, + "first_extent(path[%d].p_hdr) != ex", + depth); + return -EIO; + } while (--depth >= 0) { ix = path[depth].p_idx; - BUG_ON(ix != EXT_FIRST_INDEX(path[depth].p_hdr)); + if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) { + EXT4_ERROR_INODE(inode, + "ix != EXT_FIRST_INDEX *logical %d!", + *logical); + return -EIO; + } } - *logical = le32_to_cpu(ex->ee_block); - *phys = ext_pblock(ex); - return 0; + goto found_extent; } - BUG_ON(*logical < (le32_to_cpu(ex->ee_block) + ee_len)); + if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) { + EXT4_ERROR_INODE(inode, + "logical %d < ee_block %d + ee_len %d!", + *logical, le32_to_cpu(ex->ee_block), ee_len); + return -EIO; + } if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { /* next allocated block in this leaf */ ex++; - *logical = le32_to_cpu(ex->ee_block); - *phys = ext_pblock(ex); - return 0; + goto found_extent; } /* go up and search for index to the right */ while (--depth >= 0) { ix = path[depth].p_idx; if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) - break; + goto got_index; } - if (depth < 0) { - /* we've gone up to the root and - * found no index to the right */ - return 0; - } + /* we've gone up to the root and found no index to the right */ + return 0; +got_index: /* we've found index to the right, let's * follow it and find the closest allocated * block to the right */ ix++; - block = idx_pblock(ix); + block = ext4_idx_pblock(ix); while (++depth < path->p_depth) { - bh = sb_bread(inode->i_sb, block); - if (bh == NULL) - return -EIO; + /* subtract from p_depth to get proper eh_depth */ + bh = read_extent_tree_block(inode, block, + path->p_depth - depth, 0); + if (IS_ERR(bh)) + return 
PTR_ERR(bh); eh = ext_block_hdr(bh); - if (ext4_ext_check_header(inode, eh, depth)) { - put_bh(bh); - return -EIO; - } ix = EXT_FIRST_INDEX(eh); - block = idx_pblock(ix); + block = ext4_idx_pblock(ix); put_bh(bh); } - bh = sb_bread(inode->i_sb, block); - if (bh == NULL) - return -EIO; + bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); eh = ext_block_hdr(bh); - if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { - put_bh(bh); - return -EIO; - } ex = EXT_FIRST_EXTENT(eh); +found_extent: *logical = le32_to_cpu(ex->ee_block); - *phys = ext_pblock(ex); - put_bh(bh); + *phys = ext4_ext_pblock(ex); + *ret_ex = ex; + if (bh) + put_bh(bh); return 0; - } /* * ext4_ext_next_allocated_block: - * returns allocated block in subsequent extent or EXT_MAX_BLOCK. + * returns allocated block in subsequent extent or EXT_MAX_BLOCKS. * NOTE: it considers block number from index entry as * allocated block. Thus, index entries have to be consistent * with leaves. @@ -1182,12 +1566,13 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) depth = path->p_depth; if (depth == 0 && path->p_ext == NULL) - return EXT_MAX_BLOCK; + return EXT_MAX_BLOCKS; while (depth >= 0) { if (depth == path->p_depth) { /* leaf */ - if (path[depth].p_ext != + if (path[depth].p_ext && + path[depth].p_ext != EXT_LAST_EXTENT(path[depth].p_hdr)) return le32_to_cpu(path[depth].p_ext[1].ee_block); } else { @@ -1199,15 +1584,14 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) depth--; } - return EXT_MAX_BLOCK; + return EXT_MAX_BLOCKS; } /* * ext4_ext_next_leaf_block: - * returns first allocated block from next leaf or EXT_MAX_BLOCK + * returns first allocated block from next leaf or EXT_MAX_BLOCKS */ -static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, - struct ext4_ext_path *path) +static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path) { int depth; @@ -1216,7 +1600,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, /* zero-tree has no leaf blocks at all */ if (depth == 0) - return EXT_MAX_BLOCK; + return EXT_MAX_BLOCKS; /* go to index block */ depth--; @@ -1229,7 +1613,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, depth--; } - return EXT_MAX_BLOCK; + return EXT_MAX_BLOCKS; } /* @@ -1249,8 +1633,12 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, eh = path[depth].p_hdr; ex = path[depth].p_ext; - BUG_ON(ex == NULL); - BUG_ON(eh == NULL); + + if (unlikely(ex == NULL || eh == NULL)) { + EXT4_ERROR_INODE(inode, + "ex %p == NULL or eh %p == NULL", ex, eh); + return -EIO; + } if (depth == 0) { /* there is no tree at all */ @@ -1291,24 +1679,21 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, return err; } -static int +int ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2) { - unsigned short ext1_ee_len, ext2_ee_len, max_len; + unsigned short ext1_ee_len, ext2_ee_len; /* - * Make sure that either both extents are uninitialized, or - * both are _not_. + * Make sure that both extents are initialized. We don't merge + * unwritten extents so that we can be sure that end_io code has + * the extent that was written properly split out and conversion to + * initialized is trivial. 
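To make the merge rules above concrete: two extents can merge only when they agree on the unwritten flag, are logically and physically contiguous, and the combined length still fits in ee_len, whose top bit marks an unwritten extent (hence the usual limits of 32768 initialized blocks and 32767 unwritten ones). A small userspace model of the check, with stand-in types rather than the on-disk structures:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define INIT_MAX_LEN      32768   /* 1 << 15 */
#define UNWRITTEN_MAX_LEN 32767   /* top bit of ee_len flags "unwritten" */

struct extent {
    uint32_t lblk;   /* first logical block */
    uint64_t pblk;   /* first physical block */
    uint32_t len;    /* number of blocks */
    bool unwritten;
};

static bool can_merge(const struct extent *a, const struct extent *b)
{
    if (a->unwritten != b->unwritten)
        return false;                   /* states must match */
    if (a->lblk + a->len != b->lblk)
        return false;                   /* logically contiguous */
    if (a->pblk + a->len != b->pblk)
        return false;                   /* physically contiguous */
    if (a->len + b->len > (a->unwritten ? UNWRITTEN_MAX_LEN : INIT_MAX_LEN))
        return false;                   /* merged length must fit in ee_len */
    return true;
}

int main(void)
{
    struct extent a = { .lblk = 0,  .pblk = 500, .len = 10 };
    struct extent b = { .lblk = 10, .pblk = 510, .len = 10 };

    printf("mergeable: %d\n", can_merge(&a, &b));   /* prints 1 */
    return 0;
}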
*/ - if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) + if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) return 0; - if (ext4_ext_is_uninitialized(ex1)) - max_len = EXT_UNINIT_MAX_LEN; - else - max_len = EXT_INIT_MAX_LEN; - ext1_ee_len = ext4_ext_get_actual_len(ex1); ext2_ee_len = ext4_ext_get_actual_len(ex2); @@ -1321,14 +1706,19 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, * as an RO_COMPAT feature, refuse to merge two extents if * this can result in the top bit of ee_len being set. */ - if (ext1_ee_len + ext2_ee_len > max_len) + if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) + return 0; + if (ext4_ext_is_unwritten(ex1) && + (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || + atomic_read(&EXT4_I(inode)->i_unwritten) || + (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN))) return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) return 0; #endif - if (ext_pblock(ex1) + ext1_ee_len == ext_pblock(ex2)) + if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2)) return 1; return 0; } @@ -1340,14 +1730,13 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns * 1 if they got merged. */ -int ext4_ext_try_to_merge(struct inode *inode, - struct ext4_ext_path *path, - struct ext4_extent *ex) +static int ext4_ext_try_to_merge_right(struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex) { struct ext4_extent_header *eh; unsigned int depth, len; - int merge_done = 0; - int uninitialized = 0; + int merge_done = 0, unwritten; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); @@ -1357,30 +1746,98 @@ int ext4_ext_try_to_merge(struct inode *inode, if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) break; /* merge with next extent! */ - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; + unwritten = ext4_ext_is_unwritten(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(ex + 1)); - if (uninitialized) - ext4_ext_mark_uninitialized(ex); + if (unwritten) + ext4_ext_mark_unwritten(ex); if (ex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - ex - 1) * sizeof(struct ext4_extent); memmove(ex + 1, ex + 2, len); } - eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries) - 1); + le16_add_cpu(&eh->eh_entries, -1); merge_done = 1; WARN_ON(eh->eh_entries == 0); if (!eh->eh_entries) - ext4_error(inode->i_sb, "ext4_ext_try_to_merge", - "inode#%lu, eh->eh_entries = 0!", inode->i_ino); + EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!"); } return merge_done; } /* + * This function does a very simple check to see if we can collapse + * an extent tree with a single extent tree leaf block into the inode. + */ +static void ext4_ext_try_to_merge_up(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path) +{ + size_t s; + unsigned max_root = ext4_ext_space_root(inode, 0); + ext4_fsblk_t blk; + + if ((path[0].p_depth != 1) || + (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) || + (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root)) + return; + + /* + * We need to modify the block allocation bitmap and the block + * group descriptor to release the extent tree block. If we + * can't get the journal credits, give up.
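The collapse is worth seeing as arithmetic: with a 60-byte i_data area and 12-byte header/entry sizes, the in-inode root holds at most 4 extents, so a depth-1 tree whose root carries a single index and whose lone leaf holds 4 or fewer entries can be folded back into the inode. A sketch of just that eligibility test (the sizes are assumptions stated here, not read from the kernel headers):

#include <stdbool.h>
#include <stdio.h>

#define ROOT_BYTES 60   /* sizeof(EXT4_I(inode)->i_data), assumed */
#define HDR_BYTES  12   /* extent header size, assumed */
#define ENT_BYTES  12   /* extent entry size, assumed */

/* Can a one-leaf tree collapse back into the inode root? */
static bool can_collapse(int depth, int root_entries, int leaf_entries)
{
    int max_root = (ROOT_BYTES - HDR_BYTES) / ENT_BYTES;    /* 4 */

    return depth == 1 && root_entries == 1 && leaf_entries <= max_root;
}

int main(void)
{
    printf("%d %d\n", can_collapse(1, 1, 3), can_collapse(1, 1, 9));
    return 0;
}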
+ */ + if (ext4_journal_extend(handle, 2)) + return; + + /* + * Copy the extent data up to the inode + */ + blk = ext4_idx_pblock(path[0].p_idx); + s = le16_to_cpu(path[1].p_hdr->eh_entries) * + sizeof(struct ext4_extent_idx); + s += sizeof(struct ext4_extent_header); + + memcpy(path[0].p_hdr, path[1].p_hdr, s); + path[0].p_depth = 0; + path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + + (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr)); + path[0].p_hdr->eh_max = cpu_to_le16(max_root); + + brelse(path[1].p_bh); + ext4_free_blocks(handle, inode, NULL, blk, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET | + EXT4_FREE_BLOCKS_RESERVE); +} + +/* + * This function tries to merge the @ex extent to its neighbours in the tree: + * first into the extent on its left, then with the one on its right, and + * finally it tries to collapse a one-leaf tree back into the inode. + */ +static void ext4_ext_try_to_merge(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *ex) { + struct ext4_extent_header *eh; + unsigned int depth; + int merge_done = 0; + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + eh = path[depth].p_hdr; + + if (ex > EXT_FIRST_EXTENT(eh)) + merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); + + if (!merge_done) + (void) ext4_ext_try_to_merge_right(inode, path, ex); + + ext4_ext_try_to_merge_up(handle, inode, path); +} + +/* + * check if a portion of the "newext" extent overlaps with an + * existing extent. + * @@ -1388,9 +1845,10 @@ int ext4_ext_try_to_merge(struct inode *inode, * such that there will be no overlap, and then returns 1. * If there is no overlap found, it returns 0. */ -unsigned int ext4_ext_check_overlap(struct inode *inode, - struct ext4_extent *newext, - struct ext4_ext_path *path) +static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, + struct inode *inode, + struct ext4_extent *newext, + struct ext4_ext_path *path) { ext4_lblk_t b1, b2; unsigned int depth, len1; @@ -1401,21 +1859,22 @@ unsigned int ext4_ext_check_overlap(struct inode *inode, depth = ext_depth(inode); if (!path[depth].p_ext) goto out; - b2 = le32_to_cpu(path[depth].p_ext->ee_block); + b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block)); /* * get the next allocated block if the extent in the path - * is before the requested block(s) + * is before the requested block(s) */ if (b2 < b1) { b2 = ext4_ext_next_allocated_block(path); - if (b2 == EXT_MAX_BLOCK) + if (b2 == EXT_MAX_BLOCKS) goto out; + b2 = EXT4_LBLK_CMASK(sbi, b2); } /* check for wrap through zero on extent logical start block*/ if (b1 + len1 < b1) { - len1 = EXT_MAX_BLOCK - b1; + len1 = EXT_MAX_BLOCKS - b1; newext->ee_len = cpu_to_le16(len1); ret = 1; } @@ -1437,48 +1896,104 @@ out: */ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *newext) + struct ext4_extent *newext, int gb_flags) { - struct ext4_extent_header * eh; + struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ struct ext4_ext_path *npath = NULL; int depth, len, err; ext4_lblk_t next; - unsigned uninitialized = 0; + int mb_flags = 0, unwritten; - BUG_ON(ext4_ext_get_actual_len(newext) == 0); + if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { + EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); + return -EIO; + } depth = ext_depth(inode); ex = path[depth].p_ext; - BUG_ON(path[depth].p_hdr == NULL); + eh = path[depth].p_hdr; + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + return -EIO; + } /*
try to insert block into found extent and return */ - if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { - ext_debug("append %d block to %d:%d (from %llu)\n", - ext4_ext_get_actual_len(newext), - le32_to_cpu(ex->ee_block), - ext4_ext_get_actual_len(ex), ext_pblock(ex)); - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - return err; + if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) { /* - * ext4_can_extents_be_merged should have checked that either - * both extents are uninitialized, or both aren't. Thus we - * need to check only one of them here. + * Try to see whether we should rather test the extent on + * right from ex, or from the left of ex. This is because + * ext4_ext_find_extent() can return either extent on the + * left, or on the right from the searched position. This + * will make merging more effective. */ - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; - ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + if (ex < EXT_LAST_EXTENT(eh) && + (le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex) < + le32_to_cpu(newext->ee_block))) { + ex += 1; + goto prepend; + } else if ((ex > EXT_FIRST_EXTENT(eh)) && + (le32_to_cpu(newext->ee_block) + + ext4_ext_get_actual_len(newext) < + le32_to_cpu(ex->ee_block))) + ex -= 1; + + /* Try to append newex to the ex */ + if (ext4_can_extents_be_merged(inode, ex, newext)) { + ext_debug("append [%d]%d block to %u:[%d]%d" + "(from %llu)\n", + ext4_ext_is_unwritten(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_unwritten(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + return err; + unwritten = ext4_ext_is_unwritten(ex); + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); - if (uninitialized) - ext4_ext_mark_uninitialized(ex); - eh = path[depth].p_hdr; - nearex = ex; - goto merge; + if (unwritten) + ext4_ext_mark_unwritten(ex); + eh = path[depth].p_hdr; + nearex = ex; + goto merge; + } + +prepend: + /* Try to prepend newex to the ex */ + if (ext4_can_extents_be_merged(inode, newext, ex)) { + ext_debug("prepend %u[%d]%d block to %u:[%d]%d" + "(from %llu)\n", + le32_to_cpu(newext->ee_block), + ext4_ext_is_unwritten(newext), + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), + ext4_ext_is_unwritten(ex), + ext4_ext_get_actual_len(ex), + ext4_ext_pblock(ex)); + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + return err; + + unwritten = ext4_ext_is_unwritten(ex); + ex->ee_block = newext->ee_block; + ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + + ext4_ext_get_actual_len(newext)); + if (unwritten) + ext4_ext_mark_unwritten(ex); + eh = path[depth].p_hdr; + nearex = ex; + goto merge; + } } -repeat: depth = ext_depth(inode); eh = path[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) @@ -1486,21 +2001,22 @@ repeat: /* probably next leaf has space for us? 
*/ fex = EXT_LAST_EXTENT(eh); - next = ext4_ext_next_leaf_block(inode, path); - if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block) - && next != EXT_MAX_BLOCK) { - ext_debug("next leaf block - %d\n", next); + next = EXT_MAX_BLOCKS; + if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) + next = ext4_ext_next_leaf_block(path); + if (next != EXT_MAX_BLOCKS) { + ext_debug("next leaf block - %u\n", next); BUG_ON(npath != NULL); - npath = ext4_ext_find_extent(inode, next, NULL); + npath = ext4_ext_find_extent(inode, next, NULL, 0); if (IS_ERR(npath)) return PTR_ERR(npath); BUG_ON(npath->p_depth != path->p_depth); eh = npath[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { - ext_debug("next leaf isnt full(%d)\n", + ext_debug("next leaf isn't full(%d)\n", le16_to_cpu(eh->eh_entries)); path = npath; - goto repeat; + goto has_space; } ext_debug("next leaf has no free space(%d,%d)\n", le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); @@ -1510,7 +2026,10 @@ repeat: * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. */ - err = ext4_ext_create_new_leaf(handle, inode, path, newext); + if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) + mb_flags = EXT4_MB_USE_RESERVED; + err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, + path, newext); if (err) goto cleanup; depth = ext_depth(inode); @@ -1525,81 +2044,233 @@ has_space: if (!nearex) { /* there is no extent in this leaf, create first one */ - ext_debug("first extent in the leaf: %d:%llu:%d\n", + ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), - ext_pblock(newext), + ext4_ext_pblock(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext)); - path[depth].p_ext = EXT_FIRST_EXTENT(eh); - } else if (le32_to_cpu(newext->ee_block) + nearex = EXT_FIRST_EXTENT(eh); + } else { + if (le32_to_cpu(newext->ee_block) > le32_to_cpu(nearex->ee_block)) { -/* BUG_ON(newext->ee_block == nearex->ee_block); */ - if (nearex != EXT_LAST_EXTENT(eh)) { - len = EXT_MAX_EXTENT(eh) - nearex; - len = (len - 1) * sizeof(struct ext4_extent); - len = len < 0 ? 0 : len; - ext_debug("insert %d:%llu:%d after: nearest 0x%p, " - "move %d from 0x%p to 0x%p\n", + /* Insert after */ + ext_debug("insert %u:%llu:[%d]%d after: " + "nearest %p\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_unwritten(newext), + ext4_ext_get_actual_len(newext), + nearex); + nearex++; + } else { + /* Insert before */ + BUG_ON(newext->ee_block == nearex->ee_block); + ext_debug("insert %u:%llu:[%d]%d before: " + "nearest %p\n", le32_to_cpu(newext->ee_block), - ext_pblock(newext), + ext4_ext_pblock(newext), + ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), - nearex, len, nearex + 1, nearex + 2); - memmove(nearex + 2, nearex + 1, len); + nearex); + } + len = EXT_LAST_EXTENT(eh) - nearex + 1; + if (len > 0) { + ext_debug("insert %u:%llu:[%d]%d: " + "move %d extents from 0x%p to 0x%p\n", + le32_to_cpu(newext->ee_block), + ext4_ext_pblock(newext), + ext4_ext_is_unwritten(newext), + ext4_ext_get_actual_len(newext), + len, nearex, nearex + 1); + memmove(nearex + 1, nearex, + len * sizeof(struct ext4_extent)); } - path[depth].p_ext = nearex + 1; - } else { - BUG_ON(newext->ee_block == nearex->ee_block); - len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent); - len = len < 0 ?
0 : len; - ext_debug("insert %d:%llu:%d before: nearest 0x%p, " - "move %d from 0x%p to 0x%p\n", - le32_to_cpu(newext->ee_block), - ext_pblock(newext), - ext4_ext_get_actual_len(newext), - nearex, len, nearex + 1, nearex + 2); - memmove(nearex + 1, nearex, len); - path[depth].p_ext = nearex; } - eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)+1); - nearex = path[depth].p_ext; + le16_add_cpu(&eh->eh_entries, 1); + path[depth].p_ext = nearex; nearex->ee_block = newext->ee_block; - ext4_ext_store_pblock(nearex, ext_pblock(newext)); + ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext)); nearex->ee_len = newext->ee_len; merge: - /* try to merge extents to the right */ - ext4_ext_try_to_merge(inode, path, nearex); + /* try to merge extents */ + if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) + ext4_ext_try_to_merge(handle, inode, path, nearex); - /* try to merge extents to the left */ /* time to correct all indexes above */ err = ext4_ext_correct_indexes(handle, inode, path); if (err) goto cleanup; - err = ext4_ext_dirty(handle, inode, path + depth); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); cleanup: if (npath) { ext4_ext_drop_refs(npath); kfree(npath); } - ext4_ext_tree_changed(inode); - ext4_ext_invalidate_cache(inode); return err; } -static void -ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, - __u32 len, ext4_fsblk_t start, int type) +static int ext4_fill_fiemap_extents(struct inode *inode, + ext4_lblk_t block, ext4_lblk_t num, + struct fiemap_extent_info *fieinfo) { - struct ext4_ext_cache *cex; - BUG_ON(len == 0); - cex = &EXT4_I(inode)->i_cached_extent; - cex->ec_type = type; - cex->ec_block = block; - cex->ec_len = len; - cex->ec_start = start; + struct ext4_ext_path *path = NULL; + struct ext4_extent *ex; + struct extent_status es; + ext4_lblk_t next, next_del, start = 0, end = 0; + ext4_lblk_t last = block + num; + int exists, depth = 0, err = 0; + unsigned int flags = 0; + unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; + + while (block < last && block != EXT_MAX_BLOCKS) { + num = last - block; + /* find extent for this block */ + down_read(&EXT4_I(inode)->i_data_sem); + + if (path && ext_depth(inode) != depth) { + /* depth was changed. 
we have to realloc path */ + kfree(path); + path = NULL; + } + + path = ext4_ext_find_extent(inode, block, path, 0); + if (IS_ERR(path)) { + up_read(&EXT4_I(inode)->i_data_sem); + err = PTR_ERR(path); + path = NULL; + break; + } + + depth = ext_depth(inode); + if (unlikely(path[depth].p_hdr == NULL)) { + up_read(&EXT4_I(inode)->i_data_sem); + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + err = -EIO; + break; + } + ex = path[depth].p_ext; + next = ext4_ext_next_allocated_block(path); + ext4_ext_drop_refs(path); + + flags = 0; + exists = 0; + if (!ex) { + /* there is no extent yet, so try to allocate + * all requested space */ + start = block; + end = block + num; + } else if (le32_to_cpu(ex->ee_block) > block) { + /* need to allocate space before found extent */ + start = block; + end = le32_to_cpu(ex->ee_block); + if (block + num < end) + end = block + num; + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { + /* need to allocate space after found extent */ + start = block; + end = block + num; + if (end >= next) + end = next; + } else if (block >= le32_to_cpu(ex->ee_block)) { + /* + * some part of requested space is covered + * by found extent + */ + start = block; + end = le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex); + if (block + num < end) + end = block + num; + exists = 1; + } else { + BUG(); + } + BUG_ON(end <= start); + + if (!exists) { + es.es_lblk = start; + es.es_len = end - start; + es.es_pblk = 0; + } else { + es.es_lblk = le32_to_cpu(ex->ee_block); + es.es_len = ext4_ext_get_actual_len(ex); + es.es_pblk = ext4_ext_pblock(ex); + if (ext4_ext_is_unwritten(ex)) + flags |= FIEMAP_EXTENT_UNWRITTEN; + } + + /* + * Find delayed extent and update es accordingly. We call + * it even in !exists case to find out whether es is the + * last existing extent or not. + */ + next_del = ext4_find_delayed_extent(inode, &es); + if (!exists && next_del) { + exists = 1; + flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + } + up_read(&EXT4_I(inode)->i_data_sem); + + if (unlikely(es.es_len == 0)) { + EXT4_ERROR_INODE(inode, "es.es_len == 0"); + err = -EIO; + break; + } + + /* + * This is possible iff next == next_del == EXT_MAX_BLOCKS. + * We need to check next == EXT_MAX_BLOCKS because an extent + * can carry both unwritten and delayed status: when a + * delayed-allocated extent is later allocated by fallocate, + * the status tree tracks both states in a single extent. + * + * So we could return an unwritten and delayed extent whose + * block is equal to 'next'.
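The loop above is what ultimately services the FIEMAP ioctl, so its output is easy to observe from userspace with FS_IOC_FIEMAP. A minimal query (error handling kept short) that prints each mapped extent along with the unwritten/delalloc/last flags set in this code:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
    if (argc < 2)
        return 1;
    int fd = open(argv[1], O_RDONLY);
    if (fd < 0) { perror("open"); return 1; }

    /* room for up to 32 extents after the fixed header */
    struct fiemap *fm = calloc(1, sizeof(*fm) +
                               32 * sizeof(struct fiemap_extent));
    if (!fm)
        return 1;
    fm->fm_start = 0;
    fm->fm_length = ~0ULL;          /* map the whole file */
    fm->fm_extent_count = 32;

    if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) { perror("fiemap"); return 1; }

    for (unsigned i = 0; i < fm->fm_mapped_extents; i++) {
        struct fiemap_extent *fe = &fm->fm_extents[i];
        printf("logical %llu physical %llu len %llu%s%s%s\n",
               (unsigned long long)fe->fe_logical,
               (unsigned long long)fe->fe_physical,
               (unsigned long long)fe->fe_length,
               (fe->fe_flags & FIEMAP_EXTENT_UNWRITTEN) ? " unwritten" : "",
               (fe->fe_flags & FIEMAP_EXTENT_DELALLOC) ? " delalloc" : "",
               (fe->fe_flags & FIEMAP_EXTENT_LAST) ? " last" : "");
    }
    free(fm);
    close(fd);
    return 0;
}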
+ */ + if (next == next_del && next == EXT_MAX_BLOCKS) { + flags |= FIEMAP_EXTENT_LAST; + if (unlikely(next_del != EXT_MAX_BLOCKS || + next != EXT_MAX_BLOCKS)) { + EXT4_ERROR_INODE(inode, + "next extent == %u, next " + "delalloc extent = %u", + next, next_del); + err = -EIO; + break; + } + } + + if (exists) { + err = fiemap_fill_next_extent(fieinfo, + (__u64)es.es_lblk << blksize_bits, + (__u64)es.es_pblk << blksize_bits, + (__u64)es.es_len << blksize_bits, + flags); + if (err < 0) + break; + if (err == 1) { + err = 0; + break; + } + } + + block = es.es_lblk + es.es_len; + } + + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + + return err; } /* @@ -1612,15 +2283,16 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { int depth = ext_depth(inode); - unsigned long len; - ext4_lblk_t lblock; + unsigned long len = 0; + ext4_lblk_t lblock = 0; struct ext4_extent *ex; ex = path[depth].p_ext; if (ex == NULL) { - /* there is no extent yet, so gap is [0;-] */ - lblock = 0; - len = EXT_MAX_BLOCK; + /* + * there is no extent yet, so gap is [0;-] and we + * don't cache it + */ ext_debug("cache gap(whole file):"); } else if (block < le32_to_cpu(ex->ee_block)) { lblock = block; @@ -1629,6 +2301,9 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, block, le32_to_cpu(ex->ee_block), ext4_ext_get_actual_len(ex)); + if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) + ext4_es_insert_extent(inode, lblock, len, ~0, + EXTENT_STATUS_HOLE); } else if (block >= le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)) { ext4_lblk_t next; @@ -1642,135 +2317,175 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, block); BUG_ON(next == lblock); len = next - lblock; + if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) + ext4_es_insert_extent(inode, lblock, len, ~0, + EXTENT_STATUS_HOLE); } else { - lblock = len = 0; BUG(); } ext_debug(" -> %u:%lu\n", lblock, len); - ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP); -} - -static int -ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, - struct ext4_extent *ex) -{ - struct ext4_ext_cache *cex; - - cex = &EXT4_I(inode)->i_cached_extent; - - /* has cache valid data? */ - if (cex->ec_type == EXT4_EXT_CACHE_NO) - return EXT4_EXT_CACHE_NO; - - BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && - cex->ec_type != EXT4_EXT_CACHE_EXTENT); - if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) { - ex->ee_block = cpu_to_le32(cex->ec_block); - ext4_ext_store_pblock(ex, cex->ec_start); - ex->ee_len = cpu_to_le16(cex->ec_len); - ext_debug("%u cached by %u:%u:%llu\n", - block, - cex->ec_block, cex->ec_len, cex->ec_start); - return cex->ec_type; - } - - /* not in cache */ - return EXT4_EXT_CACHE_NO; } /* * ext4_ext_rm_idx: * removes index from the index block. - * It's used in truncate case only, thus all requests are for - * last index in the block only. 
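The reworked ext4_ext_rm_idx() below no longer assumes it is deleting the last index in a node: it closes the gap with a memmove and then, when the removed slot was the leftmost one, copies the new leftmost key into each parent on the way up. The gap-closing step in isolation, as a userspace sketch with stand-in types:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct idx { uint32_t ei_block; uint64_t pblk; };

/* Delete entries[pos] from an index node holding *nr live entries. */
static void rm_idx(struct idx *entries, int *nr, int pos)
{
    int tail = *nr - pos - 1;   /* entries to the right of pos */

    if (tail > 0)
        memmove(&entries[pos], &entries[pos + 1],
                tail * sizeof(entries[0]));
    (*nr)--;
}

int main(void)
{
    struct idx e[3] = { {0, 10}, {100, 11}, {200, 12} };
    int nr = 3;

    rm_idx(e, &nr, 1);  /* drop the middle index */
    printf("%d entries, second now starts at block %u\n",
           nr, e[1].ei_block);
    return 0;
}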
*/ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) + struct ext4_ext_path *path, int depth) { - struct buffer_head *bh; int err; ext4_fsblk_t leaf; /* free index block */ - path--; - leaf = idx_pblock(path->p_idx); - BUG_ON(path->p_hdr->eh_entries == 0); + depth--; + path = path + depth; + leaf = ext4_idx_pblock(path->p_idx); + if (unlikely(path->p_hdr->eh_entries == 0)) { + EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); + return -EIO; + } err = ext4_ext_get_access(handle, inode, path); if (err) return err; + + if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { + int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; + len *= sizeof(struct ext4_extent_idx); + memmove(path->p_idx, path->p_idx + 1, len); + } + + le16_add_cpu(&path->p_hdr->eh_entries, -1); err = ext4_ext_dirty(handle, inode, path); if (err) return err; ext_debug("index is empty, remove it, free block %llu\n", leaf); - bh = sb_find_get_block(inode->i_sb, leaf); - ext4_forget(handle, 1, inode, bh, leaf); - ext4_free_blocks(handle, inode, leaf, 1, 1); + trace_ext4_ext_rm_idx(inode, leaf); + + ext4_free_blocks(handle, inode, NULL, leaf, 1, + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + + while (--depth >= 0) { + if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr)) + break; + path--; + err = ext4_ext_get_access(handle, inode, path); + if (err) + break; + path->p_idx->ei_block = (path+1)->p_idx->ei_block; + err = ext4_ext_dirty(handle, inode, path); + if (err) + break; + } return err; } /* - * ext4_ext_calc_credits_for_insert: - * This routine returns max. credits that the extent tree can consume. - * It should be OK for low-performance paths like ->writepage() - * To allow many writing processes to fit into a single transaction, - * the caller should calculate credits under i_data_sem and - * pass the actual path. + * ext4_ext_calc_credits_for_single_extent: + * This routine returns max. credits that are needed to insert an extent + * into the extent tree. + * When passing the actual path, the caller should calculate credits + * under i_data_sem. */ -int ext4_ext_calc_credits_for_insert(struct inode *inode, +int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, struct ext4_ext_path *path) { - int depth, needed; - if (path) { + int depth = ext_depth(inode); + int ret = 0; + /* probably there is space in leaf? */ - depth = ext_depth(inode); if (le16_to_cpu(path[depth].p_hdr->eh_entries) - < le16_to_cpu(path[depth].p_hdr->eh_max)) - return 1; + < le16_to_cpu(path[depth].p_hdr->eh_max)) { + + /* + * There is some space in the leaf, no + * need to account for a leaf block credit + * + * bitmaps and block group descriptor blocks + * and other metadata blocks still need to be + * accounted. + */ + /* 1 bitmap, 1 block group descriptor */ + ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); + return ret; + } } - /* - * given 32-bit logical block (4294967296 blocks), max. tree - * can be 4 levels in depth -- 4 * 340^4 == 53453440000. - * Let's also add one more level for imbalance. - */ - depth = 5; + return ext4_chunk_trans_blocks(inode, nrblocks); +} - /* allocation of new data block(s) */ - needed = 2; +/* + * How many index/leaf blocks need to change/allocate to add @extents extents? + * + * If we add a single extent, then in the worst case, each tree level + * index/leaf needs to be changed in case of a tree split.
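Read as arithmetic, the estimate that follows is: roughly two metadata blocks per tree level for a single insertion (each level's index may be modified plus a neighbour created by a split), and three per level when a batch of extents could split the same path repeatedly. A worked model of just that formula, mirroring the values in the patch:

#include <stdio.h>

/* Mirror of the depth*2 / depth*3 estimate used by the patch. */
static int index_trans_blocks(int depth, int extents)
{
    return extents <= 1 ? depth * 2 : depth * 3;
}

int main(void)
{
    /* a depth-2 tree: 4 blocks for one extent, 6 for a batch */
    printf("%d %d\n", index_trans_blocks(2, 1), index_trans_blocks(2, 8));
    return 0;
}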
+ * + * If more extents are inserted, they could cause the whole tree to split more + * than once, but this is really rare. + */ +int ext4_ext_index_trans_blocks(struct inode *inode, int extents) +{ + int index; + int depth; - /* - * tree can be full, so it would need to grow in depth: - * we need one credit to modify old root, credits for - * new root will be added in split accounting - */ - needed += 1; + /* If we are converting the inline data, only one is needed here. */ + if (ext4_has_inline_data(inode)) + return 1; - /* - * Index split can happen, we would need: - * allocate intermediate indexes (bitmap + group) - * + change two blocks at each level, but root (already included) - */ - needed += (depth * 2) + (depth * 2); + depth = ext_depth(inode); - /* any allocation modifies superblock */ - needed += 1; + if (extents <= 1) + index = depth * 2; + else + index = depth * 3; - return needed; + return index; +} + +static inline int get_default_free_blocks_flags(struct inode *inode) +{ + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET; + else if (ext4_should_journal_data(inode)) + return EXT4_FREE_BLOCKS_FORGET; + return 0; } static int ext4_remove_blocks(handle_t *handle, struct inode *inode, - struct ext4_extent *ex, - ext4_lblk_t from, ext4_lblk_t to) + struct ext4_extent *ex, + long long *partial_cluster, + ext4_lblk_t from, ext4_lblk_t to) { - struct buffer_head *bh; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned short ee_len = ext4_ext_get_actual_len(ex); - int i, metadata = 0; + ext4_fsblk_t pblk; + int flags = get_default_free_blocks_flags(inode); + + /* + * For bigalloc file systems, we never free a partial cluster + * at the beginning of the extent. Instead, we make a note + * that we tried freeing the cluster, and check to see if we + * need to free it on a subsequent call to ext4_remove_blocks, + * or at the end of the ext4_truncate() operation. + */ + flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; + + trace_ext4_remove_blocks(inode, ex, from, to, *partial_cluster); + /* + * If we have a partial cluster, and it's different from the + * cluster of the last block, we need to explicitly free the + * partial cluster here.
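The partial-cluster bookkeeping only matters on bigalloc file systems, where a cluster is 2^s_cluster_bits blocks and EXT4_B2C()/EXT4_PBLK_COFF() are shift-and-mask helpers. The two questions the code keeps asking, modelled with an example 16-block cluster (the cluster size is an assumption for illustration):

#include <stdint.h>
#include <stdio.h>

#define CLUSTER_BITS 4  /* 16 blocks per cluster; example value only */
#define B2C(b)       ((b) >> CLUSTER_BITS)               /* block -> cluster */
#define PBLK_COFF(b) ((b) & ((1u << CLUSTER_BITS) - 1))  /* offset in cluster */

int main(void)
{
    uint64_t pblk = 1000;           /* a 37-block extent: 1000..1036 */
    uint64_t last = pblk + 37 - 1;

    /* Does the extent start mid-cluster?  Then its first cluster may
     * be shared with a neighbour and must not be freed right away. */
    printf("offset of head within cluster: %llu\n",
           (unsigned long long)PBLK_COFF(pblk));

    /* Do head and tail live in different clusters? */
    printf("head cluster %llu, tail cluster %llu\n",
           (unsigned long long)B2C(pblk), (unsigned long long)B2C(last));
    return 0;
}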
+ */ + pblk = ext4_ext_pblock(ex) + ee_len - 1; + if ((*partial_cluster > 0) && + (EXT4_B2C(sbi, pblk) != *partial_cluster)) { + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, *partial_cluster), + sbi->s_cluster_ratio, flags); + *partial_cluster = 0; + } - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - metadata = 1; #ifdef EXTENTS_STATS { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -1790,136 +2505,236 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { /* tail removal */ ext4_lblk_t num; - ext4_fsblk_t start; + unsigned int unaligned; num = le32_to_cpu(ex->ee_block) + ee_len - from; - start = ext_pblock(ex) + ee_len - num; - ext_debug("free last %u blocks starting %llu\n", num, start); - for (i = 0; i < num; i++) { - bh = sb_find_get_block(inode->i_sb, start + i); - ext4_forget(handle, 0, inode, bh, start + i); - } - ext4_free_blocks(handle, inode, start, num, metadata); - } else if (from == le32_to_cpu(ex->ee_block) - && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { - printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", - from, to, le32_to_cpu(ex->ee_block), ee_len); - } else { - printk(KERN_INFO "strange request: removal(2) " - "%u-%u from %u:%u\n", - from, to, le32_to_cpu(ex->ee_block), ee_len); - } + pblk = ext4_ext_pblock(ex) + ee_len - num; + /* + * Usually we want to free the partial cluster at the end of the + * extent, except for the situation when the cluster is still + * used by any other extent (partial_cluster is negative). + */ + if (*partial_cluster < 0 && + -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1)) + flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; + + ext_debug("free last %u blocks starting %llu partial %lld\n", + num, pblk, *partial_cluster); + ext4_free_blocks(handle, inode, NULL, pblk, num, flags); + /* + * If the block range to be freed didn't start at the + * beginning of a cluster, and we removed the entire + * extent and the cluster is not used by any other extent, + * save the partial cluster here, since we might need to + * delete it if we determine that the truncate operation has + * removed all of the blocks in the cluster. + * + * On the other hand, if we did not manage to free the whole + * extent, we have to mark the cluster as used (store negative + * cluster number in partial_cluster). + */ + unaligned = EXT4_PBLK_COFF(sbi, pblk); + if (unaligned && (ee_len == num) && + (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk)))) + *partial_cluster = EXT4_B2C(sbi, pblk); + else if (unaligned) + *partial_cluster = -((long long)EXT4_B2C(sbi, pblk)); + else if (*partial_cluster > 0) + *partial_cluster = 0; + } else + ext4_error(sbi->s_sb, "strange request: removal(2) " + "%u-%u from %u:%u\n", + from, to, le32_to_cpu(ex->ee_block), ee_len); return 0; } + +/* + * ext4_ext_rm_leaf() removes the extents associated with the + * blocks appearing between "start" and "end", and splits the extents + * if "start" and "end" appear in the same extent. + * + * @handle: The journal handle + * @inode: The file's inode + * @path: The path to the leaf + * @partial_cluster: The cluster which we'll have to free if all extents + * have been released from it. It gets negative in case + * the cluster is still used.
+ * @start: The first block to remove + * @end: The last block to remove + */ static int ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, ext4_lblk_t start) + struct ext4_ext_path *path, + long long *partial_cluster, + ext4_lblk_t start, ext4_lblk_t end) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err = 0, correct_index = 0; int depth = ext_depth(inode), credits; struct ext4_extent_header *eh; - ext4_lblk_t a, b, block; + ext4_lblk_t a, b; unsigned num; ext4_lblk_t ex_ee_block; unsigned short ex_ee_len; - unsigned uninitialized = 0; + unsigned unwritten = 0; struct ext4_extent *ex; + ext4_fsblk_t pblk; /* the header must be checked already in ext4_ext_remove_space() */ - ext_debug("truncate since %u in leaf\n", start); + ext_debug("truncate since %u in leaf to %u\n", start, end); if (!path[depth].p_hdr) path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); eh = path[depth].p_hdr; - BUG_ON(eh == NULL); - + if (unlikely(path[depth].p_hdr == NULL)) { + EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); + return -EIO; + } /* find where to start removing */ - ex = EXT_LAST_EXTENT(eh); + ex = path[depth].p_ext; + if (!ex) + ex = EXT_LAST_EXTENT(eh); ex_ee_block = le32_to_cpu(ex->ee_block); - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; ex_ee_len = ext4_ext_get_actual_len(ex); + /* + * If we're starting with an extent other than the last one in the + * node, we need to see if it shares a cluster with the extent to + * the right (towards the end of the file). If its leftmost cluster + * is this extent's rightmost cluster and it is not cluster aligned, + * we'll mark it as a partial that is not to be deallocated. + */ + + if (ex != EXT_LAST_EXTENT(eh)) { + ext4_fsblk_t current_pblk, right_pblk; + long long current_cluster, right_cluster; + + current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; + current_cluster = (long long)EXT4_B2C(sbi, current_pblk); + right_pblk = ext4_ext_pblock(ex + 1); + right_cluster = (long long)EXT4_B2C(sbi, right_pblk); + if (current_cluster == right_cluster && + EXT4_PBLK_COFF(sbi, right_pblk)) + *partial_cluster = -right_cluster; + } + + trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); + while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { - ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len); + + if (ext4_ext_is_unwritten(ex)) + unwritten = 1; + else + unwritten = 0; + + ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, + unwritten, ex_ee_len); path[depth].p_ext = ex; a = ex_ee_block > start ? ex_ee_block : start; - b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ? - ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK; + b = ex_ee_block+ex_ee_len - 1 < end ? 
+ ex_ee_block+ex_ee_len - 1 : end; ext_debug(" border %u:%u\n", a, b); - if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) { - block = 0; - num = 0; - BUG(); + /* If this extent is beyond the end of the hole, skip it */ + if (end < ex_ee_block) { + /* + * We're going to skip this extent and move to another, + * so if this extent is not cluster aligned we have + * to mark the current cluster as used to avoid + * accidentally freeing it later on + */ + pblk = ext4_ext_pblock(ex); + if (EXT4_PBLK_COFF(sbi, pblk)) + *partial_cluster = + -((long long)EXT4_B2C(sbi, pblk)); + ex--; + ex_ee_block = le32_to_cpu(ex->ee_block); + ex_ee_len = ext4_ext_get_actual_len(ex); + continue; + } else if (b != ex_ee_block + ex_ee_len - 1) { + EXT4_ERROR_INODE(inode, + "can not handle truncate %u:%u " + "on extent %u:%u", + start, end, ex_ee_block, + ex_ee_block + ex_ee_len - 1); + err = -EIO; + goto out; } else if (a != ex_ee_block) { /* remove tail of the extent */ - block = ex_ee_block; - num = a - block; - } else if (b != ex_ee_block + ex_ee_len - 1) { - /* remove head of the extent */ - block = a; - num = b - a; - /* there is no "make a hole" API yet */ - BUG(); + num = a - ex_ee_block; } else { /* remove whole extent: excellent! */ - block = ex_ee_block; num = 0; - BUG_ON(a != ex_ee_block); - BUG_ON(b != ex_ee_block + ex_ee_len - 1); } - - /* at present, extent can't cross block group: */ - /* leaf + bitmap + group desc + sb + inode */ - credits = 5; + /* + * 3 for leaf, sb, and inode plus 2 (bmap and group + * descriptor) for each block group; assume two block + * groups plus ex_ee_len/blocks_per_block_group for + * the worst case + */ + credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); if (ex == EXT_FIRST_EXTENT(eh)) { correct_index = 1; credits += (ext_depth(inode)) + 1; } -#ifdef CONFIG_QUOTA - credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); - handle = ext4_ext_journal_restart(handle, credits); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); + err = ext4_ext_truncate_extend_restart(handle, inode, credits); + if (err) goto out; - } err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; - err = ext4_remove_blocks(handle, inode, ex, a, b); + err = ext4_remove_blocks(handle, inode, ex, partial_cluster, + a, b); if (err) goto out; - if (num == 0) { + if (num == 0) /* this extent is removed; mark slot entirely unused */ ext4_ext_store_pblock(ex, 0); - eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1); - } - ex->ee_block = cpu_to_le32(block); ex->ee_len = cpu_to_le16(num); /* - * Do not mark uninitialized if all the blocks in the + * Do not mark unwritten if all the blocks in the * extent have been removed. 
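The a/b/num arithmetic above reduces to a tail cut: for truncate-style removal the range always reaches the extent's last block, so either the whole extent goes away (num == 0) or the extent keeps its first a - ee_block blocks. In miniature:

#include <stdint.h>
#include <stdio.h>

/* Remove [a, end-of-extent] from an extent that starts at ee_block;
 * returns the surviving length (0 means the whole extent is gone). */
static uint32_t cut_tail(uint32_t ee_block, uint32_t ee_len, uint32_t a)
{
    (void)ee_len;                /* tail cut always reaches the last block */
    if (a <= ee_block)
        return 0;                /* whole extent removed */
    return a - ee_block;         /* keep [ee_block, a) */
}

int main(void)
{
    printf("%u\n", cut_tail(100, 50, 120)); /* 20 blocks survive */
    printf("%u\n", cut_tail(100, 50, 100)); /* all 50 removed */
    return 0;
}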
*/ - if (uninitialized && num) - ext4_ext_mark_uninitialized(ex); + if (unwritten && num) + ext4_ext_mark_unwritten(ex); + /* + * If the extent was completely released, + * we need to remove it from the leaf + */ + if (num == 0) { + if (end != EXT_MAX_BLOCKS - 1) { + /* + * For hole punching, we need to scoot all the + * extents up when an extent is removed so that + * we don't have blank extents in the middle + */ + memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) * + sizeof(struct ext4_extent)); + + /* Now get rid of the one at the end */ + memset(EXT_LAST_EXTENT(eh), 0, + sizeof(struct ext4_extent)); + } + le16_add_cpu(&eh->eh_entries, -1); + } else if (*partial_cluster > 0) + *partial_cluster = 0; err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; - ext_debug("new extent: %u:%u:%llu\n", block, num, - ext_pblock(ex)); + ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num, + ext4_ext_pblock(ex)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); @@ -1928,10 +2743,30 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (correct_index && eh->eh_entries) err = ext4_ext_correct_indexes(handle, inode, path); + /* + * If there's a partial cluster and at least one extent remains in + * the leaf, free the partial cluster if it isn't shared with the + * current extent. If there's a partial cluster and no extents + * remain in the leaf, it can't be freed here. It can only be + * freed when it's possible to determine if it's not shared with + * any other extent - when the next leaf is processed or when space + * removal is complete. + */ + if (*partial_cluster > 0 && eh->eh_entries && + (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != + *partial_cluster)) { + int flags = get_default_free_blocks_flags(inode); + + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, *partial_cluster), + sbi->s_cluster_ratio, flags); + *partial_cluster = 0; + } + /* if this leaf is free, then we should * remove it from index block above */ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) - err = ext4_ext_rm_idx(handle, inode, path + depth); + err = ext4_ext_rm_idx(handle, inode, path, depth); out: return err; @@ -1958,43 +2793,122 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) return 1; } -static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) +int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t end) { struct super_block *sb = inode->i_sb; int depth = ext_depth(inode); - struct ext4_ext_path *path; + struct ext4_ext_path *path = NULL; + long long partial_cluster = 0; handle_t *handle; int i = 0, err = 0; - ext_debug("truncate since %u\n", start); + ext_debug("truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ - handle = ext4_journal_start(inode, depth + 1); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, depth + 1); if (IS_ERR(handle)) return PTR_ERR(handle); - ext4_ext_invalidate_cache(inode); +again: + trace_ext4_ext_remove_space(inode, start, end, depth); /* + * Check if we are removing extents inside the extent tree. If that + * is the case, we are going to punch a hole inside the extent tree + * so we have to check whether we need to split the extent covering + * the last block to remove so we can easily remove the part of it + * in ext4_ext_rm_leaf().
+ */ + if (end < EXT_MAX_BLOCKS - 1) { + struct ext4_extent *ex; + ext4_lblk_t ee_block; + + /* find extent for this block */ + path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); + if (IS_ERR(path)) { + ext4_journal_stop(handle); + return PTR_ERR(path); + } + depth = ext_depth(inode); + /* Leaf may not exist only if inode has no blocks at all */ + ex = path[depth].p_ext; + if (!ex) { + if (depth) { + EXT4_ERROR_INODE(inode, + "path[%d].p_hdr == NULL", + depth); + err = -EIO; + } + goto out; + } + + ee_block = le32_to_cpu(ex->ee_block); + + /* + * See if the last block is inside the extent, if so split + * the extent at 'end' block so we can easily remove the + * tail of the first part of the split extent in + * ext4_ext_rm_leaf(). + */ + if (end >= ee_block && + end < ee_block + ext4_ext_get_actual_len(ex) - 1) { + int split_flag = 0; + + if (ext4_ext_is_unwritten(ex)) + split_flag = EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2; + + /* + * Split the extent in two so that 'end' is the last + * block in the first new extent. Also we should not + * fail removing space due to ENOSPC so try to use + * reserved block if that happens. + */ + err = ext4_split_extent_at(handle, inode, path, + end + 1, split_flag, + EXT4_EX_NOCACHE | + EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_METADATA_NOFAIL); + + if (err < 0) + goto out; + } + } + /* * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. */ - path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_KERNEL); - if (path == NULL) { - ext4_journal_stop(handle); - return -ENOMEM; - } - path[0].p_hdr = ext_inode_hdr(inode); - if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) { - err = -EIO; - goto out; + depth = ext_depth(inode); + if (path) { + int k = i = depth; + while (--k > 0) + path[k].p_block = + le16_to_cpu(path[k].p_hdr->eh_entries)+1; + } else { + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), + GFP_NOFS); + if (path == NULL) { + ext4_journal_stop(handle); + return -ENOMEM; + } + path[0].p_depth = depth; + path[0].p_hdr = ext_inode_hdr(inode); + i = 0; + + if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { + err = -EIO; + goto out; + } } - path[0].p_depth = depth; + err = 0; while (i >= 0 && err == 0) { if (i == depth) { /* this is leaf block */ - err = ext4_ext_rm_leaf(handle, inode, path, start); + err = ext4_ext_rm_leaf(handle, inode, path, + &partial_cluster, start, + end); /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); path[i].p_bh = NULL; @@ -2027,23 +2941,23 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) struct buffer_head *bh; /* go to the next level */ ext_debug("move to level %d (block %llu)\n", - i + 1, idx_pblock(path[i].p_idx)); + i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); - bh = sb_bread(sb, idx_pblock(path[i].p_idx)); - if (!bh) { + bh = read_extent_tree_block(inode, + ext4_idx_pblock(path[i].p_idx), depth - i - 1, + EXT4_EX_NOCACHE); + if (IS_ERR(bh)) { /* should we reset i_size? */ - err = -EIO; + err = PTR_ERR(bh); break; } + /* Yield here to deal with large extent trees. + * Should be a no-op if we did IO above.
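The split requested here for punch-hole is the [a, split) / [split, b] operation that ext4_split_extent_at() implements further down; performing it at end + 1 makes the removal boundary fall exactly between two extents. The block arithmetic involved reduces to this sketch (stand-in types, not the on-disk layout):

#include <stdint.h>
#include <stdio.h>

struct ext { uint32_t lblk; uint64_t pblk; uint32_t len; };

/* Split e at logical block 'split' (which must fall inside e); the
 * right half starts at the matching offset into the physical range. */
static struct ext split_at(struct ext *e, uint32_t split)
{
    struct ext right = {
        .lblk = split,
        .pblk = e->pblk + (split - e->lblk),
        .len  = e->len - (split - e->lblk),
    };

    e->len = split - e->lblk;   /* left half keeps the old start */
    return right;
}

int main(void)
{
    struct ext e = { .lblk = 100, .pblk = 5000, .len = 50 };
    struct ext r = split_at(&e, 130);   /* hole ends at block 129 */

    printf("left %u+%u, right %u@%llu+%u\n", e.lblk, e.len,
           r.lblk, (unsigned long long)r.pblk, r.len);
    return 0;
}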
*/ + cond_resched(); if (WARN_ON(i + 1 > depth)) { err = -EIO; break; } - if (ext4_ext_check_header(inode, ext_block_hdr(bh), - depth - i - 1)) { - err = -EIO; - break; - } path[i + 1].p_bh = bh; /* save actual number of indexes since this @@ -2056,7 +2970,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) /* index is empty, remove it; * handle must be already prepared by the * truncatei_leaf() */ - err = ext4_ext_rm_idx(handle, inode, path + i); + err = ext4_ext_rm_idx(handle, inode, path, i); } /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); @@ -2066,6 +2980,21 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) } } + trace_ext4_ext_remove_space_done(inode, start, end, depth, + partial_cluster, path->p_hdr->eh_entries); + + /* If we still have something in the partial cluster and we have removed + * even the first extent, then we should free the blocks in the partial + * cluster as well. */ + if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) { + int flags = get_default_free_blocks_flags(inode); + + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(EXT4_SB(sb), partial_cluster), + EXT4_SB(sb)->s_cluster_ratio, flags); + partial_cluster = 0; + } + /* TODO: flexible tree reduction should be here */ if (path->p_hdr->eh_entries == 0) { /* @@ -2076,14 +3005,17 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) if (err == 0) { ext_inode_hdr(inode)->eh_depth = 0; ext_inode_hdr(inode)->eh_max = - cpu_to_le16(ext4_ext_space_root(inode)); + cpu_to_le16(ext4_ext_space_root(inode, 0)); err = ext4_ext_dirty(handle, inode, path); } } out: - ext4_ext_tree_changed(inode); ext4_ext_drop_refs(path); kfree(path); + if (err == -EAGAIN) { + path = NULL; + goto again; + } ext4_journal_stop(handle); return err; @@ -2098,18 +3030,20 @@ void ext4_ext_init(struct super_block *sb) * possible initialization would be here */ - if (test_opt(sb, EXTENTS)) { - printk("EXT4-fs: file extents enabled"); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { +#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) + printk(KERN_INFO "EXT4-fs: file extents enabled" #ifdef AGGRESSIVE_TEST - printk(", aggressive tests"); + ", aggressive tests" #endif #ifdef CHECK_BINSEARCH - printk(", check binsearch"); + ", check binsearch" #endif #ifdef EXTENTS_STATS - printk(", stats"); + ", stats" +#endif + "\n"); #endif - printk("\n"); #ifdef EXTENTS_STATS spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); EXT4_SB(sb)->s_ext_min = 1 << 30; @@ -2123,7 +3057,7 @@ void ext4_ext_init(struct super_block *sb) */ void ext4_ext_release(struct super_block *sb) { - if (!test_opt(sb, EXTENTS)) + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) return; #ifdef EXTENTS_STATS @@ -2138,155 +3072,1175 @@ void ext4_ext_release(struct super_block *sb) #endif } +static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) +{ + ext4_lblk_t ee_block; + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + + if (ee_len == 0) + return 0; + + return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, + EXTENT_STATUS_WRITTEN); +} + +/* FIXME!! 
we need to try to merge to left or right after zero-out */ +static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) +{ + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + int ret; + + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + + ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); + if (ret > 0) + ret = 0; + + return ret; +} + /* - * This function is called by ext4_ext_get_blocks() if someone tries to write - * to an uninitialized extent. It may result in splitting the uninitialized - * extent into multiple extents (upto three - one initialized and two - * uninitialized). - * There are three possibilities: - * a> There is no split required: Entire extent should be initialized - * b> Splits in two extents: Write is happening at either end of the extent - * c> Splits in three extents: Somone is writing in middle of the extent + * ext4_split_extent_at() splits an extent at given block. + * + * @handle: the journal handle + * @inode: the file inode + * @path: the path to the extent + * @split: the logical block where the extent is splitted. + * @split_flags: indicates if the extent could be zeroout if split fails, and + * the states(init or unwritten) of new extents. + * @flags: flags used to insert new extent to extent tree. + * + * + * Splits extent [a, b] into two extents [a, @split) and [@split, b], states + * of which are deterimined by split_flag. + * + * There are two cases: + * a> the extent are splitted into two extent. + * b> split is not needed, and just mark the extent. + * + * return 0 on success. */ -static int ext4_ext_convert_to_initialized(handle_t *handle, - struct inode *inode, - struct ext4_ext_path *path, - ext4_lblk_t iblock, - unsigned long max_blocks) +static int ext4_split_extent_at(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + ext4_lblk_t split, + int split_flag, + int flags) { - struct ext4_extent *ex, newex; - struct ext4_extent *ex1 = NULL; - struct ext4_extent *ex2 = NULL; - struct ext4_extent *ex3 = NULL; - struct ext4_extent_header *eh; - ext4_lblk_t ee_block; - unsigned int allocated, ee_len, depth; ext4_fsblk_t newblock; + ext4_lblk_t ee_block; + struct ext4_extent *ex, newex, orig_ex, zero_ex; + struct ext4_extent *ex2 = NULL; + unsigned int ee_len, depth; int err = 0; - int ret = 0; + + BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) == + (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)); + + ext_debug("ext4_split_extents_at: inode %lu, logical" + "block %llu\n", inode->i_ino, (unsigned long long)split); + + ext4_ext_show_leaf(inode, path); depth = ext_depth(inode); - eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - allocated = ee_len - (iblock - ee_block); - newblock = iblock - ee_block + ext_pblock(ex); - ex2 = ex; + newblock = split - ee_block + ext4_ext_pblock(ex); + + BUG_ON(split < ee_block || split >= (ee_block + ee_len)); + BUG_ON(!ext4_ext_is_unwritten(ex) && + split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; - /* ex1: ee_block to iblock - 1 : uninitialized */ - if (iblock > ee_block) { - ex1 = ex; - ex1->ee_len = cpu_to_le16(iblock - ee_block); - ext4_ext_mark_uninitialized(ex1); - ex2 = &newex; + if (split == ee_block) { + /* + * case b: block @split is the block that the extent begins with + * then we just change the state of the extent, and splitting + 
* is not needed. + */ + if (split_flag & EXT4_EXT_MARK_UNWRIT2) + ext4_ext_mark_unwritten(ex); + else + ext4_ext_mark_initialized(ex); + + if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) + ext4_ext_try_to_merge(handle, inode, path, ex); + + err = ext4_ext_dirty(handle, inode, path + path->p_depth); + goto out; } + + /* case a */ + memcpy(&orig_ex, ex, sizeof(orig_ex)); + ex->ee_len = cpu_to_le16(split - ee_block); + if (split_flag & EXT4_EXT_MARK_UNWRIT1) + ext4_ext_mark_unwritten(ex); + /* - * for sanity, update the length of the ex2 extent before - * we insert ex3, if ex1 is NULL. This is to avoid temporary - * overlap of blocks. + * path may lead to new leaf, not to original leaf any more + * after ext4_ext_insert_extent() returns, */ - if (!ex1 && allocated > max_blocks) - ex2->ee_len = cpu_to_le16(max_blocks); - /* ex3: to ee_block + ee_len : uninitialised */ - if (allocated > max_blocks) { - unsigned int newdepth; - ex3 = &newex; - ex3->ee_block = cpu_to_le32(iblock + max_blocks); - ext4_ext_store_pblock(ex3, newblock + max_blocks); - ex3->ee_len = cpu_to_le16(allocated - max_blocks); - ext4_ext_mark_uninitialized(ex3); - err = ext4_ext_insert_extent(handle, inode, path, ex3); + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto fix_extent_len; + + ex2 = &newex; + ex2->ee_block = cpu_to_le32(split); + ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block)); + ext4_ext_store_pblock(ex2, newblock); + if (split_flag & EXT4_EXT_MARK_UNWRIT2) + ext4_ext_mark_unwritten(ex2); + + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { + if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { + if (split_flag & EXT4_EXT_DATA_VALID1) { + err = ext4_ext_zeroout(inode, ex2); + zero_ex.ee_block = ex2->ee_block; + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(ex2)); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex2)); + } else { + err = ext4_ext_zeroout(inode, ex); + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(ex)); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex)); + } + } else { + err = ext4_ext_zeroout(inode, &orig_ex); + zero_ex.ee_block = orig_ex.ee_block; + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(&orig_ex)); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(&orig_ex)); + } + + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_len = cpu_to_le16(ee_len); + ext4_ext_try_to_merge(handle, inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + goto fix_extent_len; + + /* update extent status tree */ + err = ext4_zeroout_es(inode, &zero_ex); + + goto out; + } else if (err) + goto fix_extent_len; + +out: + ext4_ext_show_leaf(inode, path); + return err; + +fix_extent_len: + ex->ee_len = orig_ex.ee_len; + ext4_ext_dirty(handle, inode, path + depth); + return err; +} + +/* + * ext4_split_extents() splits an extent and mark extent which is covered + * by @map as split_flags indicates + * + * It may result in splitting the extent into multiple extents (up to three) + * There are three possibilities: + * a> There is no split required + * b> Splits in two extents: Split is happening at either end of the extent + * c> Splits in three extents: Somone is splitting in middle of the extent + * + */ +static int ext4_split_extent(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + struct ext4_map_blocks *map, + int split_flag, + int 
flags)
+{
+	ext4_lblk_t ee_block;
+	struct ext4_extent *ex;
+	unsigned int ee_len, depth;
+	int err = 0;
+	int unwritten;
+	int split_flag1, flags1;
+	int allocated = map->m_len;
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+	unwritten = ext4_ext_is_unwritten(ex);
+
+	if (map->m_lblk + map->m_len < ee_block + ee_len) {
+		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
+		flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
+		if (unwritten)
+			split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
+				       EXT4_EXT_MARK_UNWRIT2;
+		if (split_flag & EXT4_EXT_DATA_VALID2)
+			split_flag1 |= EXT4_EXT_DATA_VALID1;
+		err = ext4_split_extent_at(handle, inode, path,
+				map->m_lblk + map->m_len, split_flag1, flags1);
+		if (err)
+			goto out;
+	} else {
+		allocated = ee_len - (map->m_lblk - ee_block);
+	}
+	/*
+	 * Updating the path is required because the previous
+	 * ext4_split_extent_at() may have resulted in a split of the
+	 * original leaf or an extent zeroout.
+	 */
+	ext4_ext_drop_refs(path);
+	path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	if (!ex) {
+		EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+				 (unsigned long) map->m_lblk);
+		return -EIO;
+	}
+	unwritten = ext4_ext_is_unwritten(ex);
+	split_flag1 = 0;
+
+	if (map->m_lblk >= ee_block) {
+		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
+		if (unwritten) {
+			split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
+			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
+						     EXT4_EXT_MARK_UNWRIT2);
+		}
+		err = ext4_split_extent_at(handle, inode, path,
+				map->m_lblk, split_flag1, flags);
+		if (err)
+			goto out;
+	}
+
+	ext4_ext_show_leaf(inode, path);
+out:
+	return err ? err : allocated;
+}
+
+/*
+ * This function is called by ext4_ext_map_blocks() if someone tries to write
+ * to an unwritten extent. It may result in splitting the unwritten
+ * extent into multiple extents (up to three - one initialized and two
+ * unwritten).
+ * There are three possibilities:
+ *   a> There is no split required: Entire extent should be initialized
+ *   b> Splits in two extents: Write is happening at either end of the extent
+ *   c> Splits in three extents: Someone is writing in the middle of the extent
+ *
+ * Pre-conditions:
+ *  - The extent pointed to by 'path' is unwritten.
+ *  - The extent pointed to by 'path' contains a superset
+ *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
+ *
+ * Post-conditions on success:
+ *  - the returned value is the number of blocks beyond map->m_lblk
+ *    that are allocated and initialized.
+ *    It is guaranteed to be >= map->m_len.
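+ *
+ * For example (illustrative numbers): given an unwritten extent
+ * covering blocks [96, 127] and a write with map->m_lblk == 100 and
+ * map->m_len == 8, the extent may be split into [96, 99] unwritten,
+ * [100, 107] initialized and [108, 127] unwritten, and the function
+ * returns at least 8.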
+ */ +static int ext4_ext_convert_to_initialized(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, + int flags) +{ + struct ext4_sb_info *sbi; + struct ext4_extent_header *eh; + struct ext4_map_blocks split_map; + struct ext4_extent zero_ex; + struct ext4_extent *ex, *abut_ex; + ext4_lblk_t ee_block, eof_block; + unsigned int ee_len, depth, map_len = map->m_len; + int allocated = 0, max_zeroout = 0; + int err = 0; + int split_flag = 0; + + ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" + "block %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)map->m_lblk, map_len); + + sbi = EXT4_SB(inode->i_sb); + eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits; + if (eof_block < map->m_lblk + map_len) + eof_block = map->m_lblk + map_len; + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + zero_ex.ee_len = 0; + + trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); + + /* Pre-conditions */ + BUG_ON(!ext4_ext_is_unwritten(ex)); + BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); + + /* + * Attempt to transfer newly initialized blocks from the currently + * unwritten extent to its neighbor. This is much cheaper + * than an insertion followed by a merge as those involve costly + * memmove() calls. Transferring to the left is the common case in + * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE) + * followed by append writes. + * + * Limitations of the current logic: + * - L1: we do not deal with writes covering the whole extent. + * This would require removing the extent if the transfer + * is possible. + * - L2: we only attempt to merge with an extent stored in the + * same extent tree node. + */ + if ((map->m_lblk == ee_block) && + /* See if we can merge left */ + (map_len < ee_len) && /*L1*/ + (ex > EXT_FIRST_EXTENT(eh))) { /*L2*/ + ext4_lblk_t prev_lblk; + ext4_fsblk_t prev_pblk, ee_pblk; + unsigned int prev_len; + + abut_ex = ex - 1; + prev_lblk = le32_to_cpu(abut_ex->ee_block); + prev_len = ext4_ext_get_actual_len(abut_ex); + prev_pblk = ext4_ext_pblock(abut_ex); + ee_pblk = ext4_ext_pblock(ex); + /* - * The depth, and hence eh & ex might change - * as part of the insert above. + * A transfer of blocks from 'ex' to 'abut_ex' is allowed + * upon those conditions: + * - C1: abut_ex is initialized, + * - C2: abut_ex is logically abutting ex, + * - C3: abut_ex is physically abutting ex, + * - C4: abut_ex can receive the additional blocks without + * overflowing the (initialized) length limit. 
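+		 *
+		 * For example (illustrative numbers): if abut_ex is an
+		 * initialized extent covering [92, 99] at physical block
+		 * 1000 and ex is an unwritten extent covering [100, 115]
+		 * at physical block 1008, a write of blocks [100, 103]
+		 * satisfies C1-C4: abut_ex grows to [92, 103] and ex
+		 * shrinks to [104, 115], with no insert or merge needed.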
*/ - newdepth = ext_depth(inode); - if (newdepth != depth) { - depth = newdepth; - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, iblock, path); - if (IS_ERR(path)) { - err = PTR_ERR(path); + if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ + ((prev_lblk + prev_len) == ee_block) && /*C2*/ + ((prev_pblk + prev_len) == ee_pblk) && /*C3*/ + (prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) goto out; - } - eh = path[depth].p_hdr; - ex = path[depth].p_ext; - if (ex2 != &newex) - ex2 = ex; + trace_ext4_ext_convert_to_initialized_fastpath(inode, + map, ex, abut_ex); + + /* Shift the start of ex by 'map_len' blocks */ + ex->ee_block = cpu_to_le32(ee_block + map_len); + ext4_ext_store_pblock(ex, ee_pblk + map_len); + ex->ee_len = cpu_to_le16(ee_len - map_len); + ext4_ext_mark_unwritten(ex); /* Restore the flag */ + + /* Extend abut_ex by 'map_len' blocks */ + abut_ex->ee_len = cpu_to_le16(prev_len + map_len); + + /* Result: number of initialized blocks past m_lblk */ + allocated = map_len; + } + } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) && + (map_len < ee_len) && /*L1*/ + ex < EXT_LAST_EXTENT(eh)) { /*L2*/ + /* See if we can merge right */ + ext4_lblk_t next_lblk; + ext4_fsblk_t next_pblk, ee_pblk; + unsigned int next_len; + + abut_ex = ex + 1; + next_lblk = le32_to_cpu(abut_ex->ee_block); + next_len = ext4_ext_get_actual_len(abut_ex); + next_pblk = ext4_ext_pblock(abut_ex); + ee_pblk = ext4_ext_pblock(ex); + + /* + * A transfer of blocks from 'ex' to 'abut_ex' is allowed + * upon those conditions: + * - C1: abut_ex is initialized, + * - C2: abut_ex is logically abutting ex, + * - C3: abut_ex is physically abutting ex, + * - C4: abut_ex can receive the additional blocks without + * overflowing the (initialized) length limit. + */ + if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/ + ((map->m_lblk + map_len) == next_lblk) && /*C2*/ + ((ee_pblk + ee_len) == next_pblk) && /*C3*/ + (next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/ err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; + + trace_ext4_ext_convert_to_initialized_fastpath(inode, + map, ex, abut_ex); + + /* Shift the start of abut_ex by 'map_len' blocks */ + abut_ex->ee_block = cpu_to_le32(next_lblk - map_len); + ext4_ext_store_pblock(abut_ex, next_pblk - map_len); + ex->ee_len = cpu_to_le16(ee_len - map_len); + ext4_ext_mark_unwritten(ex); /* Restore the flag */ + + /* Extend abut_ex by 'map_len' blocks */ + abut_ex->ee_len = cpu_to_le16(next_len + map_len); + + /* Result: number of initialized blocks past m_lblk */ + allocated = map_len; } - allocated = max_blocks; } + if (allocated) { + /* Mark the block containing both extents as dirty */ + ext4_ext_dirty(handle, inode, path + depth); + + /* Update path to point to the right extent */ + path[depth].p_ext = abut_ex; + goto out; + } else + allocated = ee_len - (map->m_lblk - ee_block); + + WARN_ON(map->m_lblk < ee_block); /* - * If there was a change of depth as part of the - * insertion of ex3 above, we need to update the length - * of the ex1 extent again here + * It is safe to convert extent to initialized via explicit + * zeroout only if extent is fully inside i_size or new_size. 
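+	 * For example, with eof_block == 120 an extent covering
+	 * [96, 119] may be zeroed out (96 + 24 <= 120), while one
+	 * covering [96, 127] may not (128 > 120).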
*/ - if (ex1 && ex1 != ex) { - ex1 = ex; - ex1->ee_len = cpu_to_le16(iblock - ee_block); - ext4_ext_mark_uninitialized(ex1); - ex2 = &newex; - } - /* ex2: iblock to iblock + maxblocks-1 : initialised */ - ex2->ee_block = cpu_to_le32(iblock); - ext4_ext_store_pblock(ex2, newblock); - ex2->ee_len = cpu_to_le16(allocated); - if (ex2 != ex) - goto insert; + split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; + + if (EXT4_EXT_MAY_ZEROOUT & split_flag) + max_zeroout = sbi->s_extent_max_zeroout_kb >> + (inode->i_sb->s_blocksize_bits - 10); + + /* If extent is less than s_max_zeroout_kb, zeroout directly */ + if (max_zeroout && (ee_len <= max_zeroout)) { + err = ext4_ext_zeroout(inode, ex); + if (err) + goto out; + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)); + ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + ext4_ext_mark_initialized(ex); + ext4_ext_try_to_merge(handle, inode, path, ex); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); + goto out; + } + /* - * New (initialized) extent starts from the first block - * in the current extent. i.e., ex2 == ex - * We have to see if it can be merged with the extent - * on the left. + * four cases: + * 1. split the extent into three extents. + * 2. split the extent into two extents, zeroout the first half. + * 3. split the extent into two extents, zeroout the second half. + * 4. split the extent into two extents with out zeroout. */ - if (ex2 > EXT_FIRST_EXTENT(eh)) { - /* - * To merge left, pass "ex2 - 1" to try_to_merge(), - * since it merges towards right _only_. - */ - ret = ext4_ext_try_to_merge(inode, path, ex2 - 1); - if (ret) { - err = ext4_ext_correct_indexes(handle, inode, path); + split_map.m_lblk = map->m_lblk; + split_map.m_len = map->m_len; + + if (max_zeroout && (allocated > map->m_len)) { + if (allocated <= max_zeroout) { + /* case 3 */ + zero_ex.ee_block = + cpu_to_le32(map->m_lblk); + zero_ex.ee_len = cpu_to_le16(allocated); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex) + map->m_lblk - ee_block); + err = ext4_ext_zeroout(inode, &zero_ex); if (err) goto out; - depth = ext_depth(inode); - ex2--; + split_map.m_lblk = map->m_lblk; + split_map.m_len = allocated; + } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) { + /* case 2 */ + if (map->m_lblk != ee_block) { + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = cpu_to_le16(map->m_lblk - + ee_block); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex)); + err = ext4_ext_zeroout(inode, &zero_ex); + if (err) + goto out; + } + + split_map.m_lblk = ee_block; + split_map.m_len = map->m_lblk - ee_block + map->m_len; + allocated = map->m_len; } } + + allocated = ext4_split_extent(handle, inode, path, + &split_map, split_flag, flags); + if (allocated < 0) + err = allocated; + +out: + /* If we have gotten a failure, don't zero out status tree */ + if (!err) + err = ext4_zeroout_es(inode, &zero_ex); + return err ? err : allocated; +} + +/* + * This function is called by ext4_ext_map_blocks() from + * ext4_get_blocks_dio_write() when DIO to write + * to an unwritten extent. 
+ *
+ * Writing to an unwritten extent may result in splitting the unwritten
+ * extent into multiple initialized/unwritten extents (up to three).
+ * There are three possibilities:
+ *   a> There is no split required: Entire extent should be unwritten
+ *   b> Splits in two extents: Write is happening at either end of the extent
+ *   c> Splits in three extents: Someone is writing in the middle of the extent
+ *
+ * This works the same way in the case of initialized -> unwritten conversion.
+ *
+ * One or more index blocks may be needed if the extent tree grows after
+ * the unwritten extent split. To prevent ENOSPC from occurring at IO
+ * completion time, we need to split the unwritten extent before the DIO
+ * is submitted. The unwritten extent to be written will be split into
+ * three unwritten extents (at most). After IO completes, the part that
+ * was filled will be converted to initialized by the end_io callback
+ * function via ext4_convert_unwritten_extents().
+ *
+ * Returns the size of the unwritten extent to be written on success.
+ */
+static int ext4_split_convert_extents(handle_t *handle,
+					struct inode *inode,
+					struct ext4_map_blocks *map,
+					struct ext4_ext_path *path,
+					int flags)
+{
+	ext4_lblk_t eof_block;
+	ext4_lblk_t ee_block;
+	struct ext4_extent *ex;
+	unsigned int ee_len;
+	int split_flag = 0, depth;
+
+	ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
+		  __func__, inode->i_ino,
+		  (unsigned long long)map->m_lblk, map->m_len);
+
+	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+		inode->i_sb->s_blocksize_bits;
+	if (eof_block < map->m_lblk + map->m_len)
+		eof_block = map->m_lblk + map->m_len;
 	/*
-	 * Try to Merge towards right. This might be required
-	 * only when the whole extent is being written to.
-	 * i.e. ex2 == ex and ex3 == NULL.
+	 * It is safe to convert extent to initialized via explicit
+	 * zeroout only if extent is fully inside i_size or new_size.
 	 */
-	if (!ex3) {
-		ret = ext4_ext_try_to_merge(inode, path, ex2);
-		if (ret) {
-			err = ext4_ext_correct_indexes(handle, inode, path);
-			if (err)
-				goto out;
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+
+	/* Convert to unwritten */
+	if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
+		split_flag |= EXT4_EXT_DATA_VALID1;
+	/* Convert to initialized */
+	} else if (flags & EXT4_GET_BLOCKS_CONVERT) {
+		split_flag |= ee_block + ee_len <= eof_block ?
+ EXT4_EXT_MAY_ZEROOUT : 0; + split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); + } + flags |= EXT4_GET_BLOCKS_PRE_IO; + return ext4_split_extent(handle, inode, path, map, split_flag, flags); +} + +static int ext4_convert_initialized_extents(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path) +{ + struct ext4_extent *ex; + ext4_lblk_t ee_block; + unsigned int ee_len; + int depth; + int err = 0; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + ext_debug("%s: inode %lu, logical" + "block %llu, max_blocks %u\n", __func__, inode->i_ino, + (unsigned long long)ee_block, ee_len); + + if (ee_block != map->m_lblk || ee_len > map->m_len) { + err = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); + if (err < 0) + goto out; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; + } + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + err = -EIO; + goto out; + } + } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* first mark the extent as unwritten */ + ext4_ext_mark_unwritten(ex); + + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(handle, inode, path, ex); + + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + path->p_depth); +out: + ext4_ext_show_leaf(inode, path); + return err; +} + + +static int ext4_convert_unwritten_extents_endio(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path) +{ + struct ext4_extent *ex; + ext4_lblk_t ee_block; + unsigned int ee_len; + int depth; + int err = 0; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" + "block %llu, max_blocks %u\n", inode->i_ino, + (unsigned long long)ee_block, ee_len); + + /* If extent is larger than requested it is a clear sign that we still + * have some extent state machine issues left. So extent_split is still + * required. + * TODO: Once all related issues will be fixed this situation should be + * illegal. 
+ */ + if (ee_block != map->m_lblk || ee_len > map->m_len) { +#ifdef EXT4_DEBUG + ext4_warning("Inode (%ld) finished: extent logical block %llu," + " len %u; IO logical block %llu, len %u\n", + inode->i_ino, (unsigned long long)ee_block, ee_len, + (unsigned long long)map->m_lblk, map->m_len); +#endif + err = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT); + if (err < 0) + goto out; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; } + depth = ext_depth(inode); + ex = path[depth].p_ext; } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* first mark the extent as initialized */ + ext4_ext_mark_initialized(ex); + + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(handle, inode, path, ex); + /* Mark modified extent as dirty */ - err = ext4_ext_dirty(handle, inode, path + depth); - goto out; -insert: - err = ext4_ext_insert_extent(handle, inode, path, &newex); + err = ext4_ext_dirty(handle, inode, path + path->p_depth); +out: + ext4_ext_show_leaf(inode, path); + return err; +} + +static void unmap_underlying_metadata_blocks(struct block_device *bdev, + sector_t block, int count) +{ + int i; + for (i = 0; i < count; i++) + unmap_underlying_metadata(bdev, block + i); +} + +/* + * Handle EOFBLOCKS_FL flag, clearing it if necessary + */ +static int check_eofblocks_fl(handle_t *handle, struct inode *inode, + ext4_lblk_t lblk, + struct ext4_ext_path *path, + unsigned int len) +{ + int i, depth; + struct ext4_extent_header *eh; + struct ext4_extent *last_ex; + + if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) + return 0; + + depth = ext_depth(inode); + eh = path[depth].p_hdr; + + /* + * We're going to remove EOFBLOCKS_FL entirely in future so we + * do not care for this case anymore. Simply remove the flag + * if there are no extents. + */ + if (unlikely(!eh->eh_entries)) + goto out; + last_ex = EXT_LAST_EXTENT(eh); + /* + * We should clear the EOFBLOCKS_FL flag if we are writing the + * last block in the last extent in the file. We test this by + * first checking to see if the caller to + * ext4_ext_get_blocks() was interested in the last block (or + * a block beyond the last block) in the current extent. If + * this turns out to be false, we can bail out from this + * function immediately. + */ + if (lblk + len < le32_to_cpu(last_ex->ee_block) + + ext4_ext_get_actual_len(last_ex)) + return 0; + /* + * If the caller does appear to be planning to write at or + * beyond the end of the current extent, we then test to see + * if the current extent is the last extent in the file, by + * checking to make sure it was reached via the rightmost node + * at each level of the tree. + */ + for (i = depth-1; i >= 0; i--) + if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) + return 0; out: + ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + return ext4_mark_inode_dirty(handle, inode); +} + +/** + * ext4_find_delalloc_range: find delayed allocated block in the given range. + * + * Return 1 if there is a delalloc block in the range, otherwise 0. 
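+ *
+ * For example, if only blocks [8, 11] are delayed allocated, then
+ * queries for [10, 20] and [0, 8] both return 1, while a query for
+ * [12, 20] returns 0.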
+ */ +int ext4_find_delalloc_range(struct inode *inode, + ext4_lblk_t lblk_start, + ext4_lblk_t lblk_end) +{ + struct extent_status es; + + ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es); + if (es.es_len == 0) + return 0; /* there is no delay extent in this tree */ + else if (es.es_lblk <= lblk_start && + lblk_start < es.es_lblk + es.es_len) + return 1; + else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end) + return 1; + else + return 0; +} + +int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t lblk_start, lblk_end; + lblk_start = EXT4_LBLK_CMASK(sbi, lblk); + lblk_end = lblk_start + sbi->s_cluster_ratio - 1; + + return ext4_find_delalloc_range(inode, lblk_start, lblk_end); +} + +/** + * Determines how many complete clusters (out of those specified by the 'map') + * are under delalloc and were reserved quota for. + * This function is called when we are writing out the blocks that were + * originally written with their allocation delayed, but then the space was + * allocated using fallocate() before the delayed allocation could be resolved. + * The cases to look for are: + * ('=' indicated delayed allocated blocks + * '-' indicates non-delayed allocated blocks) + * (a) partial clusters towards beginning and/or end outside of allocated range + * are not delalloc'ed. + * Ex: + * |----c---=|====c====|====c====|===-c----| + * |++++++ allocated ++++++| + * ==> 4 complete clusters in above example + * + * (b) partial cluster (outside of allocated range) towards either end is + * marked for delayed allocation. In this case, we will exclude that + * cluster. + * Ex: + * |----====c========|========c========| + * |++++++ allocated ++++++| + * ==> 1 complete clusters in above example + * + * Ex: + * |================c================| + * |++++++ allocated ++++++| + * ==> 0 complete clusters in above example + * + * The ext4_da_update_reserve_space will be called only if we + * determine here that there were some "entire" clusters that span + * this 'allocated' range. + * In the non-bigalloc case, this function will just end up returning num_blks + * without ever calling ext4_find_delalloc_range. + */ +static unsigned int +get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, + unsigned int num_blks) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t alloc_cluster_start, alloc_cluster_end; + ext4_lblk_t lblk_from, lblk_to, c_offset; + unsigned int allocated_clusters = 0; + + alloc_cluster_start = EXT4_B2C(sbi, lblk_start); + alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1); + + /* max possible clusters for this allocation */ + allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1; + + trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); + + /* Check towards left side */ + c_offset = EXT4_LBLK_COFF(sbi, lblk_start); + if (c_offset) { + lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); + lblk_to = lblk_from + c_offset - 1; + + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) + allocated_clusters--; + } + + /* Now check towards right. 
*/ + c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks); + if (allocated_clusters && c_offset) { + lblk_from = lblk_start + num_blks; + lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; + + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) + allocated_clusters--; + } + + return allocated_clusters; +} + +static int +ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, int flags, + unsigned int allocated, ext4_fsblk_t newblock) +{ + int ret = 0; + int err = 0; + + /* + * Make sure that the extent is no bigger than we support with + * unwritten extent + */ + if (map->m_len > EXT_UNWRITTEN_MAX_LEN) + map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; + + ret = ext4_convert_initialized_extents(handle, inode, map, + path); + if (ret >= 0) { + ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, map->m_len); + } else + err = ret; + map->m_flags |= EXT4_MAP_UNWRITTEN; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; + + return err ? err : allocated; +} + +static int +ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, int flags, + unsigned int allocated, ext4_fsblk_t newblock) +{ + int ret = 0; + int err = 0; + ext4_io_end_t *io = ext4_inode_aio(inode); + + ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " + "block %llu, max_blocks %u, flags %x, allocated %u\n", + inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, + flags, allocated); + ext4_ext_show_leaf(inode, path); + + /* + * When writing into unwritten space, we should not fail to + * allocate metadata blocks for the new extent block if needed. + */ + flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL; + + trace_ext4_ext_handle_unwritten_extents(inode, map, flags, + allocated, newblock); + + /* get_block() before submit the IO, split the extent */ + if (flags & EXT4_GET_BLOCKS_PRE_IO) { + ret = ext4_split_convert_extents(handle, inode, map, + path, flags | EXT4_GET_BLOCKS_CONVERT); + if (ret <= 0) + goto out; + /* + * Flag the inode(non aio case) or end_io struct (aio case) + * that this IO needs to conversion to written when IO is + * completed + */ + if (io) + ext4_set_io_unwritten_flag(inode, io); + else + ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + map->m_flags |= EXT4_MAP_UNWRITTEN; + goto out; + } + /* IO end_io complete, convert the filled extent to written */ + if (flags & EXT4_GET_BLOCKS_CONVERT) { + ret = ext4_convert_unwritten_extents_endio(handle, inode, map, + path); + if (ret >= 0) { + ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, map->m_len); + } else + err = ret; + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = newblock; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; + goto out2; + } + /* buffered IO case */ + /* + * repeat fallocate creation request + * we already have an unwritten extent + */ + if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { + map->m_flags |= EXT4_MAP_UNWRITTEN; + goto map_out; + } + + /* buffered READ or buffered write_begin() lookup */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + /* + * We have blocks reserved already. We + * return allocated blocks so that delalloc + * won't do block reservation for us. But + * the buffer head will be unmapped so that + * a read from the block returns 0s. 
+ */ + map->m_flags |= EXT4_MAP_UNWRITTEN; + goto out1; + } + + /* buffered write, writepage time, convert*/ + ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags); + if (ret >= 0) + ext4_update_inode_fsync_trans(handle, inode, 1); +out: + if (ret <= 0) { + err = ret; + goto out2; + } else + allocated = ret; + map->m_flags |= EXT4_MAP_NEW; + /* + * if we allocated more blocks than requested + * we need to make sure we unmap the extra block + * allocated. The actual needed block will get + * unmapped later when we find the buffer_head marked + * new. + */ + if (allocated > map->m_len) { + unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, + newblock + map->m_len, + allocated - map->m_len); + allocated = map->m_len; + } + map->m_len = allocated; + + /* + * If we have done fallocate with the offset that is already + * delayed allocated, we would have block reservation + * and quota reservation done in the delayed write path. + * But fallocate would have already updated quota and block + * count for this offset. So cancel these reservation + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + unsigned int reserved_clusters; + reserved_clusters = get_reserved_cluster_alloc(inode, + map->m_lblk, map->m_len); + if (reserved_clusters) + ext4_da_update_reserve_space(inode, + reserved_clusters, + 0); + } + +map_out: + map->m_flags |= EXT4_MAP_MAPPED; + if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, + map->m_len); + if (err < 0) + goto out2; + } +out1: + if (allocated > map->m_len) + allocated = map->m_len; + ext4_ext_show_leaf(inode, path); + map->m_pblk = newblock; + map->m_len = allocated; +out2: return err ? err : allocated; } /* + * get_implied_cluster_alloc - check to see if the requested + * allocation (in the map structure) overlaps with a cluster already + * allocated in an extent. + * @sb The filesystem superblock structure + * @map The requested lblk->pblk mapping + * @ex The extent structure which might contain an implied + * cluster allocation + * + * This function is called by ext4_ext_map_blocks() after we failed to + * find blocks that were already in the inode's extent tree. Hence, + * we know that the beginning of the requested region cannot overlap + * the extent from the inode's extent tree. There are three cases we + * want to catch. The first is this case: + * + * |--- cluster # N--| + * |--- extent ---| |---- requested region ---| + * |==========| + * + * The second case that we need to test for is this one: + * + * |--------- cluster # N ----------------| + * |--- requested region --| |------- extent ----| + * |=======================| + * + * The third case is when the requested region lies between two extents + * within the same cluster: + * |------------- cluster # N-------------| + * |----- ex -----| |---- ex_right ----| + * |------ requested region ------| + * |================| + * + * In each of the above cases, we need to set the map->m_pblk and + * map->m_len so it corresponds to the return the extent labelled as + * "|====|" from cluster #N, since it is already in use for data in + * cluster EXT4_B2C(sbi, map->m_lblk). We will then return 1 to + * signal to ext4_ext_map_blocks() that map->m_pblk should be treated + * as a new "allocated" block region. Otherwise, we will return 0 and + * ext4_ext_map_blocks() will then allocate one or more new clusters + * by calling ext4_mb_new_blocks(). 
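+ *
+ * For example (illustrative numbers): with s_cluster_ratio == 4 and
+ * an extent mapping logical blocks [20, 21] to physical blocks
+ * [108, 109], a request for logical block 22 falls in the same
+ * cluster (logical [20, 23], physical [108, 111]), so map->m_pblk is
+ * set to 110 and 1 is returned without allocating a new cluster.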
+ */ +static int get_implied_cluster_alloc(struct super_block *sb, + struct ext4_map_blocks *map, + struct ext4_extent *ex, + struct ext4_ext_path *path) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); + ext4_lblk_t ex_cluster_start, ex_cluster_end; + ext4_lblk_t rr_cluster_start; + ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); + ext4_fsblk_t ee_start = ext4_ext_pblock(ex); + unsigned short ee_len = ext4_ext_get_actual_len(ex); + + /* The extent passed in that we are trying to match */ + ex_cluster_start = EXT4_B2C(sbi, ee_block); + ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1); + + /* The requested region passed into ext4_map_blocks() */ + rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); + + if ((rr_cluster_start == ex_cluster_end) || + (rr_cluster_start == ex_cluster_start)) { + if (rr_cluster_start == ex_cluster_end) + ee_start += ee_len - 1; + map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset; + map->m_len = min(map->m_len, + (unsigned) sbi->s_cluster_ratio - c_offset); + /* + * Check for and handle this case: + * + * |--------- cluster # N-------------| + * |------- extent ----| + * |--- requested region ---| + * |===========| + */ + + if (map->m_lblk < ee_block) + map->m_len = min(map->m_len, ee_block - map->m_lblk); + + /* + * Check for the case where there is already another allocated + * block to the right of 'ex' but before the end of the cluster. + * + * |------------- cluster # N-------------| + * |----- ex -----| |---- ex_right ----| + * |------ requested region ------| + * |================| + */ + if (map->m_lblk > ee_block) { + ext4_lblk_t next = ext4_ext_next_allocated_block(path); + map->m_len = min(map->m_len, next - map->m_lblk); + } + + trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1); + return 1; + } + + trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0); + return 0; +} + + +/* * Block allocation/map/preallocation routine for extents based files * * @@ -2304,51 +4258,27 @@ out: * * return < 0, error case. 
*/ -int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, - unsigned long max_blocks, struct buffer_head *bh_result, - int create, int extend_disksize) +int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) { struct ext4_ext_path *path = NULL; - struct ext4_extent_header *eh; - struct ext4_extent newex, *ex; - ext4_fsblk_t goal, newblock; - int err = 0, depth, ret; - unsigned long allocated = 0; + struct ext4_extent newex, *ex, *ex2; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_fsblk_t newblock = 0; + int free_on_err = 0, err = 0, depth, ret; + unsigned int allocated = 0, offset = 0; + unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; + ext4_io_end_t *io = ext4_inode_aio(inode); + ext4_lblk_t cluster_offset; + int set_unwritten = 0; - __clear_bit(BH_New, &bh_result->b_state); - ext_debug("blocks %u/%lu requested for inode %u\n", - iblock, max_blocks, inode->i_ino); - - /* check in cache */ - goal = ext4_ext_in_cache(inode, iblock, &newex); - if (goal) { - if (goal == EXT4_EXT_CACHE_GAP) { - if (!create) { - /* - * block isn't allocated yet and - * user doesn't want to allocate it - */ - goto out2; - } - /* we should allocate requested block */ - } else if (goal == EXT4_EXT_CACHE_EXTENT) { - /* block is already allocated */ - newblock = iblock - - le32_to_cpu(newex.ee_block) - + ext_pblock(&newex); - /* number of remaining blocks in the extent */ - allocated = ext4_ext_get_actual_len(&newex) - - (iblock - le32_to_cpu(newex.ee_block)); - goto out; - } else { - BUG(); - } - } + ext_debug("blocks %u/%u requested for inode %lu\n", + map->m_lblk, map->m_len, inode->i_ino); + trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ - path = ext4_ext_find_extent(inode, iblock, NULL); + path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; @@ -2362,201 +4292,354 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * this situation is possible, though, _during_ tree modification; * this is why assert can't be put in ext4_ext_find_extent() */ - BUG_ON(path[depth].p_ext == NULL && depth != 0); - eh = path[depth].p_hdr; + if (unlikely(path[depth].p_ext == NULL && depth != 0)) { + EXT4_ERROR_INODE(inode, "bad extent address " + "lblock: %lu, depth: %d pblock %lld", + (unsigned long) map->m_lblk, depth, + path[depth].p_block); + err = -EIO; + goto out2; + } ex = path[depth].p_ext; if (ex) { ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); - ext4_fsblk_t ee_start = ext_pblock(ex); + ext4_fsblk_t ee_start = ext4_ext_pblock(ex); unsigned short ee_len; + /* - * Uninitialized extents are treated as holes, except that + * unwritten extents are treated as holes, except that * we split out initialized portions during a write. 
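+		 * (A read of an unwritten block returns zeroes; a write
+		 * converts just the written sub-range to initialized.)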
*/ ee_len = ext4_ext_get_actual_len(ex); + + trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len); + /* if found extent covers block, simply return it */ - if (iblock >= ee_block && iblock < ee_block + ee_len) { - newblock = iblock - ee_block + ee_start; + if (in_range(map->m_lblk, ee_block, ee_len)) { + newblock = map->m_lblk - ee_block + ee_start; /* number of remaining blocks in the extent */ - allocated = ee_len - (iblock - ee_block); - ext_debug("%u fit into %lu:%d -> %llu\n", iblock, - ee_block, ee_len, newblock); - - /* Do not put uninitialized extent in the cache */ - if (!ext4_ext_is_uninitialized(ex)) { - ext4_ext_put_in_cache(inode, ee_block, - ee_len, ee_start, - EXT4_EXT_CACHE_EXTENT); - goto out; - } - if (create == EXT4_CREATE_UNINITIALIZED_EXT) - goto out; - if (!create) + allocated = ee_len - (map->m_lblk - ee_block); + ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, + ee_block, ee_len, newblock); + + /* + * If the extent is initialized check whether the + * caller wants to convert it to unwritten. + */ + if ((!ext4_ext_is_unwritten(ex)) && + (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { + allocated = ext4_ext_convert_initialized_extent( + handle, inode, map, path, flags, + allocated, newblock); goto out2; + } else if (!ext4_ext_is_unwritten(ex)) + goto out; - ret = ext4_ext_convert_to_initialized(handle, inode, - path, iblock, - max_blocks); - if (ret <= 0) { + ret = ext4_ext_handle_unwritten_extents( + handle, inode, map, path, flags, + allocated, newblock); + if (ret < 0) err = ret; - goto out2; - } else + else allocated = ret; - goto outnew; + goto out2; } } + if ((sbi->s_cluster_ratio > 1) && + ext4_find_delalloc_cluster(inode, map->m_lblk)) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + /* * requested block isn't allocated yet; * we couldn't try to create block if create flag is zero */ - if (!create) { + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * put just found gap into cache to speed up * subsequent requests */ - ext4_ext_put_gap_in_cache(inode, path, iblock); + if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0) + ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); goto out2; } + /* - * Okay, we need to do block allocation. Lazily initialize the block - * allocation info here if necessary. + * Okay, we need to do block allocation. */ - if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info)) - ext4_init_block_alloc_info(inode); + map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; + newex.ee_block = cpu_to_le32(map->m_lblk); + cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); + + /* + * If we are doing bigalloc, check to see if the extent returned + * by ext4_ext_find_extent() implies a cluster we can use. + */ + if (cluster_offset && ex && + get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { + ar.len = allocated = map->m_len; + newblock = map->m_pblk; + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + goto got_allocated_blocks; + } /* find neighbour allocated blocks */ - ar.lleft = iblock; + ar.lleft = map->m_lblk; err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); if (err) goto out2; - ar.lright = iblock; - err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); + ar.lright = map->m_lblk; + ex2 = NULL; + err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); if (err) goto out2; + /* Check if the extent after searching to the right implies a + * cluster we can use. 
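+	 * (this mirrors the bigalloc check after ext4_ext_find_extent()
+	 * above, but keyed off ex2, the extent found by
+	 * ext4_ext_search_right())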
*/ + if ((sbi->s_cluster_ratio > 1) && ex2 && + get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { + ar.len = allocated = map->m_len; + newblock = map->m_pblk; + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + goto got_allocated_blocks; + } + /* * See if request is beyond maximum number of blocks we can have in * a single extent. For an initialized extent this limit is - * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is - * EXT_UNINIT_MAX_LEN. + * EXT_INIT_MAX_LEN and for an unwritten extent this limit is + * EXT_UNWRITTEN_MAX_LEN. */ - if (max_blocks > EXT_INIT_MAX_LEN && - create != EXT4_CREATE_UNINITIALIZED_EXT) - max_blocks = EXT_INIT_MAX_LEN; - else if (max_blocks > EXT_UNINIT_MAX_LEN && - create == EXT4_CREATE_UNINITIALIZED_EXT) - max_blocks = EXT_UNINIT_MAX_LEN; - - /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ - newex.ee_block = cpu_to_le32(iblock); - newex.ee_len = cpu_to_le16(max_blocks); - err = ext4_ext_check_overlap(inode, &newex, path); + if (map->m_len > EXT_INIT_MAX_LEN && + !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) + map->m_len = EXT_INIT_MAX_LEN; + else if (map->m_len > EXT_UNWRITTEN_MAX_LEN && + (flags & EXT4_GET_BLOCKS_UNWRIT_EXT)) + map->m_len = EXT_UNWRITTEN_MAX_LEN; + + /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */ + newex.ee_len = cpu_to_le16(map->m_len); + err = ext4_ext_check_overlap(sbi, inode, &newex, path); if (err) allocated = ext4_ext_get_actual_len(&newex); else - allocated = max_blocks; + allocated = map->m_len; /* allocate new block */ ar.inode = inode; - ar.goal = ext4_ext_find_goal(inode, path, iblock); - ar.logical = iblock; - ar.len = allocated; + ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk); + ar.logical = map->m_lblk; + /* + * We calculate the offset from the beginning of the cluster + * for the logical block number, since when we allocate a + * physical cluster, the physical block should start at the + * same offset from the beginning of the cluster. This is + * needed so that future calls to get_implied_cluster_alloc() + * work correctly. + */ + offset = EXT4_LBLK_COFF(sbi, map->m_lblk); + ar.len = EXT4_NUM_B2C(sbi, offset+allocated); + ar.goal -= offset; + ar.logical -= offset; if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; else /* disable in-core preallocation for non-regular files */ ar.flags = 0; + if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) + ar.flags |= EXT4_MB_HINT_NOPREALLOC; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; - ext_debug("allocate new block: goal %llu, found %llu/%lu\n", - goal, newblock, allocated); - + ext_debug("allocate new block: goal %llu, found %llu/%u\n", + ar.goal, newblock, allocated); + free_on_err = 1; + allocated_clusters = ar.len; + ar.len = EXT4_C2B(sbi, ar.len) - offset; + if (ar.len > allocated) + ar.len = allocated; + +got_allocated_blocks: /* try to insert new extent into found leaf and return */ - ext4_ext_store_pblock(&newex, newblock); + ext4_ext_store_pblock(&newex, newblock + offset); newex.ee_len = cpu_to_le16(ar.len); - if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ - ext4_ext_mark_uninitialized(&newex); - err = ext4_ext_insert_extent(handle, inode, path, &newex); - if (err) { + /* Mark unwritten */ + if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){ + ext4_ext_mark_unwritten(&newex); + map->m_flags |= EXT4_MAP_UNWRITTEN; + /* + * io_end structure was created for every IO write to an + * unwritten extent. 
To avoid unnecessary conversion, + * here we flag the IO that really needs the conversion. + * For non asycn direct IO case, flag the inode state + * that we need to perform conversion when IO is done. + */ + if (flags & EXT4_GET_BLOCKS_PRE_IO) + set_unwritten = 1; + } + + err = 0; + if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) + err = check_eofblocks_fl(handle, inode, map->m_lblk, + path, ar.len); + if (!err) + err = ext4_ext_insert_extent(handle, inode, path, + &newex, flags); + + if (!err && set_unwritten) { + if (io) + ext4_set_io_unwritten_flag(inode, io); + else + ext4_set_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN); + } + + if (err && free_on_err) { + int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? + EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; /* free data blocks we just allocated */ /* not a good idea to call discard here directly, * but otherwise we'd need to call it every free() */ - ext4_mb_discard_inode_preallocations(inode); - ext4_free_blocks(handle, inode, ext_pblock(&newex), - ext4_ext_get_actual_len(&newex), 0); + ext4_discard_preallocations(inode); + ext4_free_blocks(handle, inode, NULL, newblock, + EXT4_C2B(sbi, allocated_clusters), fb_flags); goto out2; } - if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = inode->i_size; - /* previous routine could use block we allocated */ - newblock = ext_pblock(&newex); + newblock = ext4_ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); -outnew: - __set_bit(BH_New, &bh_result->b_state); + if (allocated > map->m_len) + allocated = map->m_len; + map->m_flags |= EXT4_MAP_NEW; + + /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + unsigned int reserved_clusters; + /* + * Check how many clusters we had reserved this allocated range + */ + reserved_clusters = get_reserved_cluster_alloc(inode, + map->m_lblk, allocated); + if (map->m_flags & EXT4_MAP_FROM_CLUSTER) { + if (reserved_clusters) { + /* + * We have clusters reserved for this range. + * But since we are not doing actual allocation + * and are simply using blocks from previously + * allocated cluster, we should release the + * reservation and not claim quota. + */ + ext4_da_update_reserve_space(inode, + reserved_clusters, 0); + } + } else { + BUG_ON(allocated_clusters < reserved_clusters); + if (reserved_clusters < allocated_clusters) { + struct ext4_inode_info *ei = EXT4_I(inode); + int reservation = allocated_clusters - + reserved_clusters; + /* + * It seems we claimed few clusters outside of + * the range of this allocation. We should give + * it back to the reservation pool. This can + * happen in the following case: + * + * * Suppose s_cluster_ratio is 4 (i.e., each + * cluster has 4 blocks. Thus, the clusters + * are [0-3],[4-7],[8-11]... + * * First comes delayed allocation write for + * logical blocks 10 & 11. Since there were no + * previous delayed allocated blocks in the + * range [8-11], we would reserve 1 cluster + * for this write. + * * Next comes write for logical blocks 3 to 8. + * In this case, we will reserve 2 clusters + * (for [0-3] and [4-7]; and not for [8-11] as + * that range has a delayed allocated blocks. + * Thus total reserved clusters now becomes 3. + * * Now, during the delayed allocation writeout + * time, we will first write blocks [3-8] and + * allocate 3 clusters for writing these + * blocks. Also, we would claim all these + * three clusters above. 
+ * * Now when we come here to writeout the + * blocks [10-11], we would expect to claim + * the reservation of 1 cluster we had made + * (and we would claim it since there are no + * more delayed allocated blocks in the range + * [8-11]. But our reserved cluster count had + * already gone to 0. + * + * Thus, at the step 4 above when we determine + * that there are still some unwritten delayed + * allocated blocks outside of our current + * block range, we should increment the + * reserved clusters count so that when the + * remaining blocks finally gets written, we + * could claim them. + */ + dquot_reserve_block(inode, + EXT4_C2B(sbi, reservation)); + spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks += reservation; + spin_unlock(&ei->i_block_reservation_lock); + } + /* + * We will claim quota for all newly allocated blocks. + * We're updating the reserved space *after* the + * correction above so we do not accidentally free + * all the metadata reservation because we might + * actually need it later on. + */ + ext4_da_update_reserve_space(inode, allocated_clusters, + 1); + } + } - /* Cache only when it is _not_ an uninitialized extent */ - if (create != EXT4_CREATE_UNINITIALIZED_EXT) - ext4_ext_put_in_cache(inode, iblock, allocated, newblock, - EXT4_EXT_CACHE_EXTENT); + /* + * Cache the extent and update transaction to commit on fdatasync only + * when it is _not_ an unwritten extent. + */ + if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0) + ext4_update_inode_fsync_trans(handle, inode, 1); + else + ext4_update_inode_fsync_trans(handle, inode, 0); out: - if (allocated > max_blocks) - allocated = max_blocks; + if (allocated > map->m_len) + allocated = map->m_len; ext4_ext_show_leaf(inode, path); - __set_bit(BH_Mapped, &bh_result->b_state); - bh_result->b_bdev = inode->i_sb->s_bdev; - bh_result->b_blocknr = newblock; + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = newblock; + map->m_len = allocated; out2: if (path) { ext4_ext_drop_refs(path); kfree(path); } + + trace_ext4_ext_map_blocks_exit(inode, flags, map, + err ? err : allocated); + ext4_es_lru_add(inode); return err ? err : allocated; } -void ext4_ext_truncate(struct inode * inode, struct page *page) +void ext4_ext_truncate(handle_t *handle, struct inode *inode) { - struct address_space *mapping = inode->i_mapping; struct super_block *sb = inode->i_sb; ext4_lblk_t last_block; - handle_t *handle; int err = 0; /* - * probably first extent we're gonna free will be last in block - */ - err = ext4_writepage_trans_blocks(inode) + 3; - handle = ext4_journal_start(inode, err); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } - return; - } - - if (page) - ext4_block_truncate_page(handle, page, mapping, inode->i_size); - - down_write(&EXT4_I(inode)->i_data_sem); - ext4_ext_invalidate_cache(inode); - - ext4_mb_discard_inode_preallocations(inode); - - /* * TODO: optimization is possible here. * Probably we need not scan at all, * because page truncation is enough. 
*/ - if (ext4_orphan_add(handle, inode)) - goto out_stop; /* we have to know where to truncate from in crash case */ EXT4_I(inode)->i_disksize = inode->i_size; @@ -2564,170 +4647,866 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); - err = ext4_ext_remove_space(inode, last_block); +retry: + err = ext4_es_remove_extent(inode, last_block, + EXT_MAX_BLOCKS - last_block); + if (err == -ENOMEM) { + cond_resched(); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + if (err) { + ext4_std_error(inode->i_sb, err); + return; + } + err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); + ext4_std_error(inode->i_sb, err); +} - /* In a multi-transaction truncate, we only make the final - * transaction synchronous. +static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, + ext4_lblk_t len, int flags, int mode) +{ + struct inode *inode = file_inode(file); + handle_t *handle; + int ret = 0; + int ret2 = 0; + int retries = 0; + struct ext4_map_blocks map; + unsigned int credits; + + map.m_lblk = offset; + /* + * Don't normalize the request if it can fit in one extent so + * that it doesn't get unnecessarily split into multiple + * extents. */ - if (IS_SYNC(inode)) - handle->h_sync = 1; + if (len <= EXT_UNWRITTEN_MAX_LEN) + flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; -out_stop: /* - * If this was a simple ftruncate() and the file will remain alive, - * then we need to clear up the orphan record which we created above. - * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the - * orphan info for us. + * credits to insert 1 extent into extent tree */ - if (inode->i_nlink) - ext4_orphan_del(handle, inode); + credits = ext4_chunk_trans_blocks(inode, len); - up_write(&EXT4_I(inode)->i_data_sem); - ext4_journal_stop(handle); +retry: + while (ret >= 0 && ret < len) { + map.m_lblk = map.m_lblk + ret; + map.m_len = len = len - ret; + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret <= 0) { + ext4_debug("inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); + ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_journal_stop(handle); + break; + } + ret2 = ext4_journal_stop(handle); + if (ret2) + break; + } + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) { + ret = 0; + goto retry; + } + + return ret > 0 ? 
ret2 : ret; } -/* - * ext4_ext_writepage_trans_blocks: - * calculate max number of blocks we could modify - * in order to allocate new block for an inode - */ -int ext4_ext_writepage_trans_blocks(struct inode *inode, int num) +static long ext4_zero_range(struct file *file, loff_t offset, + loff_t len, int mode) { - int needed; + struct inode *inode = file_inode(file); + handle_t *handle = NULL; + unsigned int max_blocks; + loff_t new_size = 0; + int ret = 0; + int flags; + int partial; + loff_t start, end; + ext4_lblk_t lblk; + struct address_space *mapping = inode->i_mapping; + unsigned int blkbits = inode->i_blkbits; - needed = ext4_ext_calc_credits_for_insert(inode, NULL); + trace_ext4_zero_range(inode, offset, len, mode); - /* caller wants to allocate num blocks, but note it includes sb */ - needed = needed * num - (num - 1); + if (!S_ISREG(inode->i_mode)) + return -EINVAL; -#ifdef CONFIG_QUOTA - needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif + /* Call ext4_force_commit to flush all data in case of data=journal. */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; + } + + /* + * Write out all dirty pages to avoid race conditions + * Then release them. + */ + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + ret = filemap_write_and_wait_range(mapping, offset, + offset + len - 1); + if (ret) + return ret; + } + + /* + * Round up offset. This is not fallocate, we neet to zero out + * blocks, so convert interior block aligned part of the range to + * unwritten and possibly manually zero out unaligned parts of the + * range. + */ + start = round_up(offset, 1 << blkbits); + end = round_down((offset + len), 1 << blkbits); + + if (start < offset || end > offset + len) + return -EINVAL; + partial = (offset + len) & ((1 << blkbits) - 1); + + lblk = start >> blkbits; + max_blocks = (end >> blkbits); + if (max_blocks < lblk) + max_blocks = 0; + else + max_blocks -= lblk; + + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT | + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN; + if (mode & FALLOC_FL_KEEP_SIZE) + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + + mutex_lock(&inode->i_mutex); + + /* + * Indirect files do not support unwritten extnets + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out_mutex; + /* + * If we have a partial block after EOF we have to allocate + * the entire block. + */ + if (partial) + max_blocks += 1; + } + + if (max_blocks > 0) { + + /* Now release the pages and zero block aligned part of pages*/ + truncate_pagecache_range(inode, start, end - 1); + + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + /* + * Remove entire range from the extent status tree. 
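ext4_zero_range() above converts only the interior, block-aligned part of the range to unwritten extents and zeroes the edges by hand. A minimal sketch of that alignment arithmetic, assuming a 4096-byte block size; plain C, not kernel code.

/* Sketch of ext4_zero_range()'s alignment arithmetic for blkbits = 12
 * (4096-byte blocks). */
#include <stdio.h>

int main(void)
{
	unsigned int blkbits = 12;
	long long offset = 3000, len = 10000;	/* example range */

	/* only the interior aligned part becomes unwritten extents */
	long long start = ((offset + 4095) / 4096) * 4096;  /* round_up   */
	long long end   = ((offset + len) / 4096) * 4096;   /* round_down */
	int partial = (offset + len) & ((1 << blkbits) - 1);

	long long lblk = start >> blkbits;
	long long max_blocks = (end >> blkbits) > lblk ?
			       (end >> blkbits) - lblk : 0;

	/* start=4096 end=12288 -> blocks 1..2 are converted; the
	 * unaligned head [3000,4096) and tail [12288,13000) must be
	 * zeroed manually by ext4_zero_partial_blocks(). */
	printf("lblk=%lld max_blocks=%lld partial=%d\n",
	       lblk, max_blocks, partial);
	return 0;
}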
+ */ + ret = ext4_es_remove_extent(inode, lblk, max_blocks); + if (ret) + goto out_dio; + + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, + mode); + if (ret) + goto out_dio; + } + + handle = ext4_journal_start(inode, EXT4_HT_MISC, 4); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_std_error(inode->i_sb, ret); + goto out_dio; + } + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + + if (new_size) { + if (new_size > i_size_read(inode)) + i_size_write(inode, new_size); + if (new_size > EXT4_I(inode)->i_disksize) + ext4_update_i_disksize(inode, new_size); + } else { + /* + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if ((offset + len) > i_size_read(inode)) + ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + } - return needed; + ext4_mark_inode_dirty(handle, inode); + + /* Zero out partial block at the edges of the range */ + ret = ext4_zero_partial_blocks(handle, inode, offset, len); + + if (file->f_flags & O_SYNC) + ext4_handle_sync(handle); + + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; } /* - * preallocate space for a file. This implements ext4's fallocate inode + * preallocate space for a file. This implements ext4's fallocate file * operation, which gets called from sys_fallocate system call. * For block-mapped files, posix_fallocate should fall back to the method * of writing zeroes to the required new blocks (the same behavior which is * expected for file systems which do not support fallocate() system call). */ -long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) +long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { + struct inode *inode = file_inode(file); handle_t *handle; - ext4_lblk_t block; - unsigned long max_blocks; - ext4_fsblk_t nblocks = 0; + loff_t new_size = 0; + unsigned int max_blocks; int ret = 0; - int ret2 = 0; - int retries = 0; - struct buffer_head map_bh; - unsigned int credits, blkbits = inode->i_blkbits; + int flags; + ext4_lblk_t lblk; + struct timespec tv; + unsigned int blkbits = inode->i_blkbits; + + /* Return error if mode is not supported */ + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) + return -EOPNOTSUPP; + + if (mode & FALLOC_FL_PUNCH_HOLE) + return ext4_punch_hole(inode, offset, len); + + ret = ext4_convert_inline_data(inode); + if (ret) + return ret; /* * currently supporting (pre)allocate mode for extent-based * files _only_ */ - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; - /* preallocation to directories is currently not supported */ - if (S_ISDIR(inode->i_mode)) - return -ENODEV; + if (mode & FALLOC_FL_COLLAPSE_RANGE) + return ext4_collapse_range(inode, offset, len); - block = offset >> blkbits; - max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - block; + if (mode & FALLOC_FL_ZERO_RANGE) + return ext4_zero_range(file, offset, len, mode); + trace_ext4_fallocate_enter(inode, offset, len, mode); + lblk = offset >> blkbits; /* - * credits to insert 1 extent into extent tree + buffers to be able to - * modify 1 super block, 1 block bitmap and 1 group descriptor. 
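For reference, this is how the preallocation path above looks from userspace; a minimal sketch using the standard fallocate(2) API with FALLOC_FL_KEEP_SIZE (the file name is illustrative).

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	/*
	 * Reserve 1 MiB of unwritten extents past EOF without growing
	 * i_size; this exercises the EXT4_GET_BLOCKS_KEEP_SIZE handling
	 * above.
	 */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) != 0)
		perror("fallocate");
	close(fd);
	return 0;
}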
+ * We can't just convert len to max_blocks because + * If blocksize = 4096 offset = 3072 and len = 2048 */ - credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; + max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) + - lblk; + + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; + if (mode & FALLOC_FL_KEEP_SIZE) + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + mutex_lock(&inode->i_mutex); -retry: - while (ret >= 0 && ret < max_blocks) { - block = block + ret; - max_blocks = max_blocks - ret; - handle = ext4_journal_start(inode, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - break; + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out; + } + + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode); + if (ret) + goto out; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + goto out; + + tv = inode->i_ctime = ext4_current_time(inode); + + if (new_size) { + if (new_size > i_size_read(inode)) { + i_size_write(inode, new_size); + inode->i_mtime = tv; } + if (new_size > EXT4_I(inode)->i_disksize) + ext4_update_i_disksize(inode, new_size); + } else { + /* + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. + */ + if ((offset + len) > i_size_read(inode)) + ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + } + ext4_mark_inode_dirty(handle, inode); + if (file->f_flags & O_SYNC) + ext4_handle_sync(handle); - ret = ext4_get_blocks_wrap(handle, inode, block, - max_blocks, &map_bh, - EXT4_CREATE_UNINITIALIZED_EXT, 0); - if (ret <= 0) { -#ifdef EXT4FS_DEBUG - WARN_ON(ret <= 0); - printk(KERN_ERR "%s: ext4_ext_get_blocks " - "returned error inode#%lu, block=%u, " - "max_blocks=%lu", __func__, - inode->i_ino, block, max_blocks); -#endif - ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); +out: + mutex_unlock(&inode->i_mutex); + trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); + return ret; +} + +/* + * This function convert a range of blocks to written extents + * The caller of this function will pass the start offset and the size. + * all unwritten extents within this range will be converted to + * written extents. + * + * This function is called from the direct IO end io call back + * function, to convert the fallocated extents after IO is completed. + * Returns 0 on success. + */ +int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len) +{ + unsigned int max_blocks; + int ret = 0; + int ret2 = 0; + struct ext4_map_blocks map; + unsigned int credits, blkbits = inode->i_blkbits; + + map.m_lblk = offset >> blkbits; + /* + * We can't just convert len to max_blocks because + * If blocksize = 4096 offset = 3072 and len = 2048 + */ + max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - + map.m_lblk); + /* + * This is somewhat ugly but the idea is clear: When transaction is + * reserved, everything goes into it. Otherwise we rather start several + * smaller transactions for conversion of each extent separately. 
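The comment above notes that len cannot simply be shifted to get max_blocks. A worked sketch of the example it gives (blocksize 4096, offset 3072, len 2048), assuming EXT4_BLOCK_ALIGN() rounds up to the next block boundary.

/* Why max_blocks != len >> blkbits: worked example from the comment
 * above. Plain C sketch, not kernel code. */
#include <stdio.h>

int main(void)
{
	unsigned int blkbits = 12;
	long long offset = 3072, len = 2048;

	long long lblk = offset >> blkbits;		     /* 0 */
	/* EXT4_BLOCK_ALIGN() rounds up to a block boundary */
	long long end_blk = (offset + len + 4095) >> blkbits; /* 2 */
	long long max_blocks = end_blk - lblk;		     /* 2 */

	/* len >> blkbits would give 0, but the range [3072,5120)
	 * actually straddles two blocks. */
	printf("max_blocks=%lld (len >> blkbits = %lld)\n",
	       max_blocks, len >> blkbits);
	return 0;
}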
+ */ + if (handle) { + handle = ext4_journal_start_reserved(handle, + EXT4_HT_EXT_CONVERT); + if (IS_ERR(handle)) + return PTR_ERR(handle); + credits = 0; + } else { + /* + * credits to insert 1 extent into extent tree + */ + credits = ext4_chunk_trans_blocks(inode, max_blocks); + } + while (ret >= 0 && ret < max_blocks) { + map.m_lblk += ret; + map.m_len = (max_blocks -= ret); + if (credits) { + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + break; + } + } + ret = ext4_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_IO_CONVERT_EXT); + if (ret <= 0) + ext4_warning(inode->i_sb, + "inode #%lu: block %u: len %u: " + "ext4_ext_map_blocks returned %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); + ext4_mark_inode_dirty(handle, inode); + if (credits) ret2 = ext4_journal_stop(handle); + if (ret <= 0 || ret2) break; + } + if (!credits) + ret2 = ext4_journal_stop(handle); + return ret > 0 ? ret2 : ret; +} + +/* + * If newes is not existing extent (newes->ec_pblk equals zero) find + * delayed extent at start of newes and update newes accordingly and + * return start of the next delayed extent. + * + * If newes is existing extent (newes->ec_pblk is not equal zero) + * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed + * extent found. Leave newes unmodified. + */ +static int ext4_find_delayed_extent(struct inode *inode, + struct extent_status *newes) +{ + struct extent_status es; + ext4_lblk_t block, next_del; + + if (newes->es_pblk == 0) { + ext4_es_find_delayed_extent_range(inode, newes->es_lblk, + newes->es_lblk + newes->es_len - 1, &es); + + /* + * No extent in extent-tree contains block @newes->es_pblk, + * then the block may stay in 1)a hole or 2)delayed-extent. + */ + if (es.es_len == 0) + /* A hole found. */ + return 0; + + if (es.es_lblk > newes->es_lblk) { + /* A hole found. */ + newes->es_len = min(es.es_lblk - newes->es_lblk, + newes->es_len); + return 0; } - if (ret > 0) { - /* check wrap through sign-bit/zero here */ - if ((block + ret) < 0 || (block + ret) < block) { - ret = -EIO; - ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); - break; + + newes->es_len = es.es_lblk + es.es_len - newes->es_lblk; + } + + block = newes->es_lblk + newes->es_len; + ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es); + if (es.es_len == 0) + next_del = EXT_MAX_BLOCKS; + else + next_del = es.es_lblk; + + return next_del; +} +/* fiemap flags we can handle specified here */ +#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) + +static int ext4_xattr_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo) +{ + __u64 physical = 0; + __u64 length; + __u32 flags = FIEMAP_EXTENT_LAST; + int blockbits = inode->i_sb->s_blocksize_bits; + int error = 0; + + /* in-inode? 
*/ + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + struct ext4_iloc iloc; + int offset; /* offset of xattr in inode */ + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + physical = (__u64)iloc.bh->b_blocknr << blockbits; + offset = EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize; + physical += offset; + length = EXT4_SB(inode->i_sb)->s_inode_size - offset; + flags |= FIEMAP_EXTENT_DATA_INLINE; + brelse(iloc.bh); + } else { /* external block */ + physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits; + length = inode->i_sb->s_blocksize; + } + + if (physical) + error = fiemap_fill_next_extent(fieinfo, 0, physical, + length, flags); + return (error < 0 ? error : 0); +} + +int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + ext4_lblk_t start_blk; + int error = 0; + + if (ext4_has_inline_data(inode)) { + int has_inline = 1; + + error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline); + + if (has_inline) + return error; + } + + if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + error = ext4_ext_precache(inode); + if (error) + return error; + } + + /* fallback to generic here if not in extents fmt */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return generic_block_fiemap(inode, fieinfo, start, len, + ext4_get_block); + + if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) + return -EBADR; + + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { + error = ext4_xattr_fiemap(inode, fieinfo); + } else { + ext4_lblk_t len_blks; + __u64 last_blk; + + start_blk = start >> inode->i_sb->s_blocksize_bits; + last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; + if (last_blk >= EXT_MAX_BLOCKS) + last_blk = EXT_MAX_BLOCKS-1; + len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; + + /* + * Walk the extent tree gathering extent information + * and pushing extents back to the user. + */ + error = ext4_fill_fiemap_extents(inode, start_blk, + len_blks, fieinfo); + } + ext4_es_lru_add(inode); + return error; +} + +/* + * ext4_access_path: + * Function to access the path buffer for marking it dirty. + * It also checks if there are sufficient credits left in the journal handle + * to update path. + */ +static int +ext4_access_path(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + int credits, err; + + if (!ext4_handle_valid(handle)) + return 0; + + /* + * Check if need to extend journal credits + * 3 for leaf, sb, and inode plus 2 (bmap and group + * descriptor) for each block group; assume two block + * groups + */ + if (handle->h_buffer_credits < 7) { + credits = ext4_writepage_trans_blocks(inode); + err = ext4_ext_truncate_extend_restart(handle, inode, credits); + /* EAGAIN is success */ + if (err && err != -EAGAIN) + return err; + } + + err = ext4_ext_get_access(handle, inode, path); + return err; +} + +/* + * ext4_ext_shift_path_extents: + * Shift the extents of a path structure lying between path[depth].p_ext + * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift + * from starting block for each extent. 
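ext4_fiemap() above services the FIEMAP ioctl; for reference, a minimal userspace caller using the standard Linux fiemap API (error handling trimmed; the 32-extent buffer size is arbitrary, a real caller loops until FIEMAP_EXTENT_LAST).

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_start = 0;
	fm->fm_length = ~0ULL;			/* whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* flush delalloc first */
	fm->fm_extent_count = 32;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0) {
		for (unsigned int i = 0; i < fm->fm_mapped_extents; i++)
			printf("logical %llu physical %llu len %llu flags %x\n",
			       (unsigned long long)fm->fm_extents[i].fe_logical,
			       (unsigned long long)fm->fm_extents[i].fe_physical,
			       (unsigned long long)fm->fm_extents[i].fe_length,
			       fm->fm_extents[i].fe_flags);
	}
	free(fm);
	close(fd);
	return 0;
}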
+ */ +static int +ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, + struct inode *inode, handle_t *handle, + ext4_lblk_t *start) +{ + int depth, err = 0; + struct ext4_extent *ex_start, *ex_last; + bool update = 0; + depth = path->p_depth; + + while (depth >= 0) { + if (depth == path->p_depth) { + ex_start = path[depth].p_ext; + if (!ex_start) + return -EIO; + + ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); + if (!ex_last) + return -EIO; + + err = ext4_access_path(handle, inode, path + depth); + if (err) + goto out; + + if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) + update = 1; + + *start = le32_to_cpu(ex_last->ee_block) + + ext4_ext_get_actual_len(ex_last); + + while (ex_start <= ex_last) { + le32_add_cpu(&ex_start->ee_block, -shift); + /* Try to merge to the left. */ + if ((ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) && + ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + else + ex_start++; } - if (buffer_new(&map_bh) && ((block + ret) > - (EXT4_BLOCK_ALIGN(i_size_read(inode), blkbits) - >> blkbits))) - nblocks = nblocks + ret; + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + if (--depth < 0 || !update) + break; } - /* Update ctime if new blocks get allocated */ - if (nblocks) { - struct timespec now; + /* Update index too */ + err = ext4_access_path(handle, inode, path + depth); + if (err) + goto out; - now = current_fs_time(inode->i_sb); - if (!timespec_equal(&inode->i_ctime, &now)) - inode->i_ctime = now; - } + le32_add_cpu(&path[depth].p_idx->ei_block, -shift); + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; - ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); - if (ret2) + /* we are done if current index is not a starting index */ + if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) break; + + depth--; } - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; +out: + return err; +} + +/* + * ext4_ext_shift_extents: + * All the extents which lies in the range from start to the last allocated + * block for the file are shifted downwards by shift blocks. + * On success, 0 is returned, error otherwise. + */ +static int +ext4_ext_shift_extents(struct inode *inode, handle_t *handle, + ext4_lblk_t start, ext4_lblk_t shift) +{ + struct ext4_ext_path *path; + int ret = 0, depth; + struct ext4_extent *extent; + ext4_lblk_t stop_block, current_block; + ext4_lblk_t ex_start, ex_end; + + /* Let path point to the last extent */ + path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + + depth = path->p_depth; + extent = path[depth].p_ext; + if (!extent) { + ext4_ext_drop_refs(path); + kfree(path); + return ret; + } + + stop_block = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + ext4_ext_drop_refs(path); + kfree(path); + + /* Nothing to shift, if hole is at the end of file */ + if (start >= stop_block) + return ret; /* - * Time to update the file size. - * Update only when preallocation was requested beyond the file size. + * Don't start shifting extents until we make sure the hole is big + * enough to accomodate the shift. 
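ext4_ext_shift_extents() refuses to shift unless the hole left by the collapsed range can absorb the shift. A small standalone sketch of that validity check, mirroring the (start == ex_start && shift > ex_start) || (shift > start - ex_end) test just below; values are illustrative.

#include <stdio.h>

static int hole_can_absorb_shift(unsigned int start, unsigned int shift,
				 unsigned int ex_start, unsigned int ex_end)
{
	/* mirrors the -EINVAL conditions in ext4_ext_shift_extents() */
	if (start == ex_start && shift > ex_start)
		return 0;
	if (shift > start - ex_end)
		return 0;
	return 1;
}

int main(void)
{
	/* extent before the hole ends at block 100, shifting starts at
	 * block 160, so at most 60 blocks of shift fit */
	printf("%d\n", hole_can_absorb_shift(160, 50, 40, 100)); /* 1: ok */
	printf("%d\n", hole_can_absorb_shift(160, 70, 40, 100)); /* 0: too big */
	return 0;
}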
*/ - if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len) > i_size_read(inode)) { - if (ret > 0) { - /* - * if no error, we assume preallocation succeeded - * completely - */ - i_size_write(inode, offset + len); - EXT4_I(inode)->i_disksize = i_size_read(inode); - } else if (ret < 0 && nblocks) { - /* Handle partial allocation scenario */ - loff_t newsize; + path = ext4_ext_find_extent(inode, start - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + if (extent) { + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + ex_start = 0; + ex_end = 0; + } + ext4_ext_drop_refs(path); + kfree(path); + + if ((start == ex_start && shift > ex_start) || + (shift > start - ex_end)) + return -EINVAL; + + /* Its safe to start updating extents */ + while (start < stop_block) { + path = ext4_ext_find_extent(inode, start, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + if (!extent) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) start); + return -EIO; + } - newsize = (nblocks << blkbits) + i_size_read(inode); - i_size_write(inode, EXT4_BLOCK_ALIGN(newsize, blkbits)); - EXT4_I(inode)->i_disksize = i_size_read(inode); + current_block = le32_to_cpu(extent->ee_block); + if (start > current_block) { + /* Hole, move to the next extent */ + ret = mext_next_extent(inode, path, &extent); + if (ret != 0) { + ext4_ext_drop_refs(path); + kfree(path); + if (ret == 1) + ret = 0; + break; + } } + ret = ext4_ext_shift_path_extents(path, shift, inode, + handle, &start); + ext4_ext_drop_refs(path); + kfree(path); + if (ret) + break; + } + + return ret; +} + +/* + * ext4_collapse_range: + * This implements the fallocate's collapse range functionality for ext4 + * Returns: 0 and non-zero on error. + */ +int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +{ + struct super_block *sb = inode->i_sb; + ext4_lblk_t punch_start, punch_stop; + handle_t *handle; + unsigned int credits; + loff_t new_size, ioffset; + int ret; + + /* Collapse range works only on fs block size aligned offsets. */ + if (offset & (EXT4_BLOCK_SIZE(sb) - 1) || + len & (EXT4_BLOCK_SIZE(sb) - 1)) + return -EINVAL; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) + return -EOPNOTSUPP; + + trace_ext4_collapse_range(inode, offset, len); + + punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); + punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); + + /* Call ext4_force_commit to flush all data in case of data=journal. */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; + } + + /* + * Need to round down offset to be aligned with page size boundary + * for page size > block size. 
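For reference, the collapse-range path above as seen from userspace; a minimal sketch using the standard fallocate(2) API with FALLOC_FL_COLLAPSE_RANGE (file name and offsets are illustrative).

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
	int fd = open("testfile", O_RDWR);

	if (fd < 0)
		return 1;
	/*
	 * Remove one 4096-byte block from the middle of the file; both
	 * offset and len must be fs-block aligned, and the range must
	 * not reach EOF, per the -EINVAL checks above.
	 */
	if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 4096, 4096) != 0)
		perror("collapse");
	close(fd);
	return 0;
}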
+ */ + ioffset = round_down(offset, PAGE_SIZE); + + /* Write out all dirty pages */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); + if (ret) + return ret; + + /* Take mutex lock */ + mutex_lock(&inode->i_mutex); + + /* + * There is no need to overlap collapse range with EOF, in which case + * it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + ret = -EINVAL; + goto out_mutex; + } + + /* Currently just for extent based files */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + truncate_pagecache(inode, ioffset); + + /* Wait for existing dio to complete */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_dio; + } + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + ret = ext4_es_remove_extent(inode, punch_start, + EXT_MAX_BLOCKS - punch_start); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; } + ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + ext4_discard_preallocations(inode); + + ret = ext4_ext_shift_extents(inode, handle, punch_stop, + punch_stop - punch_start); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + new_size = i_size_read(inode) - len; + i_size_write(inode, new_size); + EXT4_I(inode)->i_disksize = new_size; + + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + +out_stop: + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: mutex_unlock(&inode->i_mutex); - return ret > 0 ? ret2 : ret; + return ret; } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c new file mode 100644 index 00000000000..0b7e28e7eaa --- /dev/null +++ b/fs/ext4/extents_status.c @@ -0,0 +1,1127 @@ +/* + * fs/ext4/extents_status.c + * + * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> + * Modified by + * Allison Henderson <achender@linux.vnet.ibm.com> + * Hugh Dickins <hughd@google.com> + * Zheng Liu <wenqing.lz@taobao.com> + * + * Ext4 extents status tree core functions. + */ +#include <linux/rbtree.h> +#include <linux/list_sort.h> +#include "ext4.h" +#include "extents_status.h" + +#include <trace/events/ext4.h> + +/* + * According to previous discussion in Ext4 Developer Workshop, we + * will introduce a new structure called io tree to track all extent + * status in order to solve some problems that we have met + * (e.g. Reservation space warning), and provide extent-level locking. + * Delay extent tree is the first step to achieve this goal. It is + * original built by Yongqiang Yang. At that time it is called delay + * extent tree, whose goal is only track delayed extents in memory to + * simplify the implementation of fiemap and bigalloc, and introduce + * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called + * delay extent tree at the first commit. But for better understand + * what it does, it has been rename to extent status tree. + * + * Step1: + * Currently the first step has been done. All delayed extents are + * tracked in the tree. 
It maintains the delayed extent when a delayed
+ * allocation is issued, and the delayed extent is written out or
+ * invalidated. Therefore the implementations of fiemap and bigalloc
+ * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
+ *
+ * The following comment describes the implementation of the extent
+ * status tree and future work.
+ *
+ * Step2:
+ * In this step all extent status is tracked by the extent status tree.
+ * Thus, we can first try to look up a block mapping in this tree before
+ * finding it in the extent tree. Hence, the single extent cache can be
+ * removed because the extent status tree can do a better job. Extents in
+ * the status tree are loaded on-demand. Therefore, the extent status
+ * tree may not contain all of the extents in a file. Meanwhile we define
+ * a shrinker to reclaim memory from the extent status tree because a
+ * fragmented extent tree will make the status tree cost too much memory.
+ * Written/unwritten/hole extents in the tree will be reclaimed by this
+ * shrinker when we are under high memory pressure. Delayed extents will
+ * not be reclaimed because fiemap, bigalloc, and seek_data/hole need
+ * them.
+ */

+/*
+ * Extent status tree implementation for ext4.
+ *
+ *
+ * ==========================================================================
+ * Extent status tree tracks all extent status.
+ *
+ * 1. Why do we need to implement the extent status tree?
+ *
+ * Without the extent status tree, ext4 identifies a delayed extent by
+ * looking up the page cache; this has several deficiencies: complicated,
+ * buggy, and inefficient code.
+ *
+ * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
+ * block or a range of blocks belongs to a delayed extent.
+ *
+ * Let us have a look at how they work without the extent status tree.
+ * -- FIEMAP
+ * FIEMAP looks up the page cache to identify delayed allocations from
+ * holes.
+ *
+ * -- SEEK_HOLE/DATA
+ * SEEK_HOLE/DATA has the same problem as FIEMAP.
+ *
+ * -- bigalloc
+ * bigalloc looks up the page cache to figure out if a block is
+ * already under delayed allocation or not to determine whether
+ * quota reserving is needed for the cluster.
+ *
+ * -- writeout
+ * Writeout looks up the whole page cache to see if a buffer is
+ * mapped. If there are not very many delayed buffers, then it is
+ * time consuming.
+ *
+ * With the extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
+ * bigalloc and writeout can figure out if a block or a range of
+ * blocks is under delayed allocation (belongs to a delayed extent) or
+ * not by searching the extent status tree.
+ *
+ *
+ * ==========================================================================
+ * 2. Ext4 extent status tree implementation
+ *
+ * -- extent
+ * An extent is a range of blocks which are contiguous logically and
+ * physically. Unlike an extent in the extent tree, this extent is an
+ * in-memory struct; there is no corresponding on-disk data. There is
+ * no limit on the length of an extent, so an extent can contain as
+ * many blocks as they are contiguous logically and physically.
+ *
+ * -- extent status tree
+ * Every inode has an extent status tree and all allocation blocks
+ * are added to the tree with different status. The extents in the
+ * tree are ordered by logical block number.
+ *
+ * -- operations on an extent status tree
+ * There are three important operations on an extent status tree:
+ * finding the next extent, adding an extent (a range of blocks), and
+ * removing an extent.
+ *
+ * -- race on an extent status tree
+ * The extent status tree is protected by inode->i_es_lock.
+ * + * -- memory consumption + * Fragmented extent tree will make extent status tree cost too much + * memory. Hence, we will reclaim written/unwritten/hole extents from + * the tree under a heavy memory pressure. + * + * + * ========================================================================== + * 3. Performance analysis + * + * -- overhead + * 1. There is a cache extent for write access, so if writes are + * not very random, adding space operaions are in O(1) time. + * + * -- gain + * 2. Code is much simpler, more readable, more maintainable and + * more efficient. + * + * + * ========================================================================== + * 4. TODO list + * + * -- Refactor delayed space reservation + * + * -- Extent-level locking + */ + +static struct kmem_cache *ext4_es_cachep; + +static int __es_insert_extent(struct inode *inode, struct extent_status *newes); +static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t end); +static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, + int nr_to_scan); +static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei); + +int __init ext4_init_es(void) +{ + ext4_es_cachep = kmem_cache_create("ext4_extent_status", + sizeof(struct extent_status), + 0, (SLAB_RECLAIM_ACCOUNT), NULL); + if (ext4_es_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_es(void) +{ + if (ext4_es_cachep) + kmem_cache_destroy(ext4_es_cachep); +} + +void ext4_es_init_tree(struct ext4_es_tree *tree) +{ + tree->root = RB_ROOT; + tree->cache_es = NULL; +} + +#ifdef ES_DEBUG__ +static void ext4_es_print_tree(struct inode *inode) +{ + struct ext4_es_tree *tree; + struct rb_node *node; + + printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino); + tree = &EXT4_I(inode)->i_es_tree; + node = rb_first(&tree->root); + while (node) { + struct extent_status *es; + es = rb_entry(node, struct extent_status, rb_node); + printk(KERN_DEBUG " [%u/%u) %llu %x", + es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + node = rb_next(node); + } + printk(KERN_DEBUG "\n"); +} +#else +#define ext4_es_print_tree(inode) +#endif + +static inline ext4_lblk_t ext4_es_end(struct extent_status *es) +{ + BUG_ON(es->es_lblk + es->es_len < es->es_lblk); + return es->es_lblk + es->es_len - 1; +} + +/* + * search through the tree for an delayed extent with a given offset. If + * it can't be found, try to find next extent. + */ +static struct extent_status *__es_tree_search(struct rb_root *root, + ext4_lblk_t lblk) +{ + struct rb_node *node = root->rb_node; + struct extent_status *es = NULL; + + while (node) { + es = rb_entry(node, struct extent_status, rb_node); + if (lblk < es->es_lblk) + node = node->rb_left; + else if (lblk > ext4_es_end(es)) + node = node->rb_right; + else + return es; + } + + if (es && lblk < es->es_lblk) + return es; + + if (es && lblk > ext4_es_end(es)) { + node = rb_next(&es->rb_node); + return node ? rb_entry(node, struct extent_status, rb_node) : + NULL; + } + + return NULL; +} + +/* + * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering + * @es->lblk if it exists, otherwise, the next extent after @es->lblk. 
+ * + * @inode: the inode which owns delayed extents + * @lblk: the offset where we start to search + * @end: the offset where we stop to search + * @es: delayed extent that we found + */ +void ext4_es_find_delayed_extent_range(struct inode *inode, + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es) +{ + struct ext4_es_tree *tree = NULL; + struct extent_status *es1 = NULL; + struct rb_node *node; + + BUG_ON(es == NULL); + BUG_ON(end < lblk); + trace_ext4_es_find_delayed_extent_range_enter(inode, lblk); + + read_lock(&EXT4_I(inode)->i_es_lock); + tree = &EXT4_I(inode)->i_es_tree; + + /* find extent in cache firstly */ + es->es_lblk = es->es_len = es->es_pblk = 0; + if (tree->cache_es) { + es1 = tree->cache_es; + if (in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u) %llu %x\n", + lblk, es1->es_lblk, es1->es_len, + ext4_es_pblock(es1), ext4_es_status(es1)); + goto out; + } + } + + es1 = __es_tree_search(&tree->root, lblk); + +out: + if (es1 && !ext4_es_is_delayed(es1)) { + while ((node = rb_next(&es1->rb_node)) != NULL) { + es1 = rb_entry(node, struct extent_status, rb_node); + if (es1->es_lblk > end) { + es1 = NULL; + break; + } + if (ext4_es_is_delayed(es1)) + break; + } + } + + if (es1 && ext4_es_is_delayed(es1)) { + tree->cache_es = es1; + es->es_lblk = es1->es_lblk; + es->es_len = es1->es_len; + es->es_pblk = es1->es_pblk; + } + + read_unlock(&EXT4_I(inode)->i_es_lock); + + trace_ext4_es_find_delayed_extent_range_exit(inode, es); +} + +static struct extent_status * +ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, + ext4_fsblk_t pblk) +{ + struct extent_status *es; + es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); + if (es == NULL) + return NULL; + es->es_lblk = lblk; + es->es_len = len; + es->es_pblk = pblk; + + /* + * We don't count delayed extent because we never try to reclaim them + */ + if (!ext4_es_is_delayed(es)) { + EXT4_I(inode)->i_es_lru_nr++; + percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); + } + + return es; +} + +static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) +{ + /* Decrease the lru counter when this es is not delayed */ + if (!ext4_es_is_delayed(es)) { + BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); + EXT4_I(inode)->i_es_lru_nr--; + percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); + } + + kmem_cache_free(ext4_es_cachep, es); +} + +/* + * Check whether or not two extents can be merged + * Condition: + * - logical block number is contiguous + * - physical block number is contiguous + * - status is equal + */ +static int ext4_es_can_be_merged(struct extent_status *es1, + struct extent_status *es2) +{ + if (ext4_es_status(es1) != ext4_es_status(es2)) + return 0; + + if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) { + pr_warn("ES assertion failed when merging extents. 
" + "The sum of lengths of es1 (%d) and es2 (%d) " + "is bigger than allowed file size (%d)\n", + es1->es_len, es2->es_len, EXT_MAX_BLOCKS); + WARN_ON(1); + return 0; + } + + if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk) + return 0; + + if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && + (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2))) + return 1; + + if (ext4_es_is_hole(es1)) + return 1; + + /* we need to check delayed extent is without unwritten status */ + if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1)) + return 1; + + return 0; +} + +static struct extent_status * +ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct extent_status *es1; + struct rb_node *node; + + node = rb_prev(&es->rb_node); + if (!node) + return es; + + es1 = rb_entry(node, struct extent_status, rb_node); + if (ext4_es_can_be_merged(es1, es)) { + es1->es_len += es->es_len; + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + es = es1; + } + + return es; +} + +static struct extent_status * +ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct extent_status *es1; + struct rb_node *node; + + node = rb_next(&es->rb_node); + if (!node) + return es; + + es1 = rb_entry(node, struct extent_status, rb_node); + if (ext4_es_can_be_merged(es, es1)) { + es->es_len += es1->es_len; + rb_erase(node, &tree->root); + ext4_es_free_extent(inode, es1); + } + + return es; +} + +#ifdef ES_AGGRESSIVE_TEST +#include "ext4_extents.h" /* Needed when ES_AGGRESSIVE_TEST is defined */ + +static void ext4_es_insert_extent_ext_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ex; + ext4_lblk_t ee_block; + ext4_fsblk_t ee_start; + unsigned short ee_len; + int depth, ee_status, es_status; + + path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); + if (IS_ERR(path)) + return; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + + if (ex) { + + ee_block = le32_to_cpu(ex->ee_block); + ee_start = ext4_ext_pblock(ex); + ee_len = ext4_ext_get_actual_len(ex); + + ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0; + es_status = ext4_es_is_unwritten(es) ? 1 : 0; + + /* + * Make sure ex and es are not overlap when we try to insert + * a delayed/hole extent. + */ + if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { + if (in_range(es->es_lblk, ee_block, ee_len)) { + pr_warn("ES insert assertion failed for " + "inode: %lu we can find an extent " + "at block [%d/%d/%llu/%c], but we " + "want to add a delayed/hole extent " + "[%d/%d/%llu/%x]\n", + inode->i_ino, ee_block, ee_len, + ee_start, ee_status ? 'u' : 'w', + es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + goto out; + } + + /* + * We don't check ee_block == es->es_lblk, etc. because es + * might be a part of whole extent, vice versa. + */ + if (es->es_lblk < ee_block || + ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { + pr_warn("ES insert assertion failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 
'u' : 'w'); + goto out; + } + + if (ee_status ^ es_status) { + pr_warn("ES insert assertion failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + } + } else { + /* + * We can't find an extent on disk. So we need to make sure + * that we don't want to add an written/unwritten extent. + */ + if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { + pr_warn("ES insert assertion failed for inode: %lu " + "can't find an extent at block %d but we want " + "to add a written/unwritten extent " + "[%d/%d/%llu/%x]\n", inode->i_ino, + es->es_lblk, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + } +out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } +} + +static void ext4_es_insert_extent_ind_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_map_blocks map; + int retval; + + /* + * Here we call ext4_ind_map_blocks to lookup a block mapping because + * 'Indirect' structure is defined in indirect.c. So we couldn't + * access direct/indirect tree from outside. It is too dirty to define + * this function in indirect.c file. + */ + + map.m_lblk = es->es_lblk; + map.m_len = es->es_len; + + retval = ext4_ind_map_blocks(NULL, inode, &map, 0); + if (retval > 0) { + if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) { + /* + * We want to add a delayed/hole extent but this + * block has been allocated. + */ + pr_warn("ES insert assertion failed for inode: %lu " + "We can find blocks but we want to add a " + "delayed/hole extent [%d/%d/%llu/%x]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } else if (ext4_es_is_written(es)) { + if (retval != es->es_len) { + pr_warn("ES insert assertion failed for " + "inode: %lu retval %d != es_len %d\n", + inode->i_ino, retval, es->es_len); + return; + } + if (map.m_pblk != ext4_es_pblock(es)) { + pr_warn("ES insert assertion failed for " + "inode: %lu m_pblk %llu != " + "es_pblk %llu\n", + inode->i_ino, map.m_pblk, + ext4_es_pblock(es)); + return; + } + } else { + /* + * We don't need to check unwritten extent because + * indirect-based file doesn't have it. + */ + BUG_ON(1); + } + } else if (retval == 0) { + if (ext4_es_is_written(es)) { + pr_warn("ES insert assertion failed for inode: %lu " + "We can't find the block but we want to add " + "a written extent [%d/%d/%llu/%x]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } + } +} + +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ + /* + * We don't need to worry about the race condition because + * caller takes i_data_sem locking. 
+ */ + BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ext4_es_insert_extent_ext_check(inode, es); + else + ext4_es_insert_extent_ind_check(inode, es); +} +#else +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ +} +#endif + +static int __es_insert_extent(struct inode *inode, struct extent_status *newes) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct rb_node **p = &tree->root.rb_node; + struct rb_node *parent = NULL; + struct extent_status *es; + + while (*p) { + parent = *p; + es = rb_entry(parent, struct extent_status, rb_node); + + if (newes->es_lblk < es->es_lblk) { + if (ext4_es_can_be_merged(newes, es)) { + /* + * Here we can modify es_lblk directly + * because it isn't overlapped. + */ + es->es_lblk = newes->es_lblk; + es->es_len += newes->es_len; + if (ext4_es_is_written(es) || + ext4_es_is_unwritten(es)) + ext4_es_store_pblock(es, + newes->es_pblk); + es = ext4_es_try_to_merge_left(inode, es); + goto out; + } + p = &(*p)->rb_left; + } else if (newes->es_lblk > ext4_es_end(es)) { + if (ext4_es_can_be_merged(es, newes)) { + es->es_len += newes->es_len; + es = ext4_es_try_to_merge_right(inode, es); + goto out; + } + p = &(*p)->rb_right; + } else { + BUG_ON(1); + return -EINVAL; + } + } + + es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len, + newes->es_pblk); + if (!es) + return -ENOMEM; + rb_link_node(&es->rb_node, parent, p); + rb_insert_color(&es->rb_node, &tree->root); + +out: + tree->cache_es = es; + return 0; +} + +/* + * ext4_es_insert_extent() adds information to an inode's extent + * status tree. + * + * Return 0 on success, error code on failure. + */ +int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status) +{ + struct extent_status newes; + ext4_lblk_t end = lblk + len - 1; + int err = 0; + + es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", + lblk, len, pblk, status, inode->i_ino); + + if (!len) + return 0; + + BUG_ON(end < lblk); + + newes.es_lblk = lblk; + newes.es_len = len; + ext4_es_store_pblock_status(&newes, pblk, status); + trace_ext4_es_insert_extent(inode, &newes); + + ext4_es_insert_extent_check(inode, &newes); + + write_lock(&EXT4_I(inode)->i_es_lock); + err = __es_remove_extent(inode, lblk, end); + if (err != 0) + goto error; +retry: + err = __es_insert_extent(inode, &newes); + if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, + EXT4_I(inode))) + goto retry; + if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) + err = 0; + +error: + write_unlock(&EXT4_I(inode)->i_es_lock); + + ext4_es_print_tree(inode); + + return err; +} + +/* + * ext4_es_cache_extent() inserts information into the extent status + * tree if and only if there isn't information about the range in + * question already. 
+ */ +void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status) +{ + struct extent_status *es; + struct extent_status newes; + ext4_lblk_t end = lblk + len - 1; + + newes.es_lblk = lblk; + newes.es_len = len; + ext4_es_store_pblock_status(&newes, pblk, status); + trace_ext4_es_cache_extent(inode, &newes); + + if (!len) + return; + + BUG_ON(end < lblk); + + write_lock(&EXT4_I(inode)->i_es_lock); + + es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk); + if (!es || es->es_lblk > end) + __es_insert_extent(inode, &newes); + write_unlock(&EXT4_I(inode)->i_es_lock); +} + +/* + * ext4_es_lookup_extent() looks up an extent in extent status tree. + * + * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks. + * + * Return: 1 on found, 0 on not + */ +int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es) +{ + struct ext4_es_tree *tree; + struct extent_status *es1 = NULL; + struct rb_node *node; + int found = 0; + + trace_ext4_es_lookup_extent_enter(inode, lblk); + es_debug("lookup extent in block %u\n", lblk); + + tree = &EXT4_I(inode)->i_es_tree; + read_lock(&EXT4_I(inode)->i_es_lock); + + /* find extent in cache firstly */ + es->es_lblk = es->es_len = es->es_pblk = 0; + if (tree->cache_es) { + es1 = tree->cache_es; + if (in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u)\n", + lblk, es1->es_lblk, es1->es_len); + found = 1; + goto out; + } + } + + node = tree->root.rb_node; + while (node) { + es1 = rb_entry(node, struct extent_status, rb_node); + if (lblk < es1->es_lblk) + node = node->rb_left; + else if (lblk > ext4_es_end(es1)) + node = node->rb_right; + else { + found = 1; + break; + } + } + +out: + if (found) { + BUG_ON(!es1); + es->es_lblk = es1->es_lblk; + es->es_len = es1->es_len; + es->es_pblk = es1->es_pblk; + } + + read_unlock(&EXT4_I(inode)->i_es_lock); + + trace_ext4_es_lookup_extent_exit(inode, es, found); + return found; +} + +static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t end) +{ + struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; + struct rb_node *node; + struct extent_status *es; + struct extent_status orig_es; + ext4_lblk_t len1, len2; + ext4_fsblk_t block; + int err; + +retry: + err = 0; + es = __es_tree_search(&tree->root, lblk); + if (!es) + goto out; + if (es->es_lblk > end) + goto out; + + /* Simply invalidate cache_es. */ + tree->cache_es = NULL; + + orig_es.es_lblk = es->es_lblk; + orig_es.es_len = es->es_len; + orig_es.es_pblk = es->es_pblk; + + len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0; + len2 = ext4_es_end(es) > end ? 
ext4_es_end(es) - end : 0; + if (len1 > 0) + es->es_len = len1; + if (len2 > 0) { + if (len1 > 0) { + struct extent_status newes; + + newes.es_lblk = end + 1; + newes.es_len = len2; + block = 0x7FDEADBEEFULL; + if (ext4_es_is_written(&orig_es) || + ext4_es_is_unwritten(&orig_es)) + block = ext4_es_pblock(&orig_es) + + orig_es.es_len - len2; + ext4_es_store_pblock_status(&newes, block, + ext4_es_status(&orig_es)); + err = __es_insert_extent(inode, &newes); + if (err) { + es->es_lblk = orig_es.es_lblk; + es->es_len = orig_es.es_len; + if ((err == -ENOMEM) && + __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, + EXT4_I(inode))) + goto retry; + goto out; + } + } else { + es->es_lblk = end + 1; + es->es_len = len2; + if (ext4_es_is_written(es) || + ext4_es_is_unwritten(es)) { + block = orig_es.es_pblk + orig_es.es_len - len2; + ext4_es_store_pblock(es, block); + } + } + goto out; + } + + if (len1 > 0) { + node = rb_next(&es->rb_node); + if (node) + es = rb_entry(node, struct extent_status, rb_node); + else + es = NULL; + } + + while (es && ext4_es_end(es) <= end) { + node = rb_next(&es->rb_node); + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + if (!node) { + es = NULL; + break; + } + es = rb_entry(node, struct extent_status, rb_node); + } + + if (es && es->es_lblk < end + 1) { + ext4_lblk_t orig_len = es->es_len; + + len1 = ext4_es_end(es) - end; + es->es_lblk = end + 1; + es->es_len = len1; + if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) { + block = es->es_pblk + orig_len - len1; + ext4_es_store_pblock(es, block); + } + } + +out: + return err; +} + +/* + * ext4_es_remove_extent() removes a space from a extent status tree. + * + * Return 0 on success, error code on failure. + */ +int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + ext4_lblk_t end; + int err = 0; + + trace_ext4_es_remove_extent(inode, lblk, len); + es_debug("remove [%u/%u) from extent status tree of inode %lu\n", + lblk, len, inode->i_ino); + + if (!len) + return err; + + end = lblk + len - 1; + BUG_ON(end < lblk); + + write_lock(&EXT4_I(inode)->i_es_lock); + err = __es_remove_extent(inode, lblk, end); + write_unlock(&EXT4_I(inode)->i_es_lock); + ext4_es_print_tree(inode); + return err; +} + +static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, + struct list_head *b) +{ + struct ext4_inode_info *eia, *eib; + eia = list_entry(a, struct ext4_inode_info, i_es_lru); + eib = list_entry(b, struct ext4_inode_info, i_es_lru); + + if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && + !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) + return 1; + if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && + ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) + return -1; + if (eia->i_touch_when == eib->i_touch_when) + return 0; + if (time_after(eia->i_touch_when, eib->i_touch_when)) + return 1; + else + return -1; +} + +static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei) +{ + struct ext4_inode_info *ei; + struct list_head *cur, *tmp; + LIST_HEAD(skipped); + int nr_shrunk = 0; + int retried = 0, skip_precached = 1, nr_skipped = 0; + + spin_lock(&sbi->s_es_lru_lock); + +retry: + list_for_each_safe(cur, tmp, &sbi->s_es_lru) { + int shrunk; + + /* + * If we have already reclaimed all extents from extent + * status tree, just stop the loop immediately. 
+ */ + if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0) + break; + + ei = list_entry(cur, struct ext4_inode_info, i_es_lru); + + /* + * Skip the inode that is newer than the last_sorted + * time. Normally we try hard to avoid shrinking + * precached inodes, but we will as a last resort. + */ + if ((sbi->s_es_last_sorted < ei->i_touch_when) || + (skip_precached && ext4_test_inode_state(&ei->vfs_inode, + EXT4_STATE_EXT_PRECACHED))) { + nr_skipped++; + list_move_tail(cur, &skipped); + continue; + } + + if (ei->i_es_lru_nr == 0 || ei == locked_ei || + !write_trylock(&ei->i_es_lock)) + continue; + + shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); + if (ei->i_es_lru_nr == 0) + list_del_init(&ei->i_es_lru); + write_unlock(&ei->i_es_lock); + + nr_shrunk += shrunk; + nr_to_scan -= shrunk; + if (nr_to_scan == 0) + break; + } + + /* Move the newer inodes into the tail of the LRU list. */ + list_splice_tail(&skipped, &sbi->s_es_lru); + INIT_LIST_HEAD(&skipped); + + /* + * If we skipped any inodes, and we weren't able to make any + * forward progress, sort the list and try again. + */ + if ((nr_shrunk == 0) && nr_skipped && !retried) { + retried++; + list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); + sbi->s_es_last_sorted = jiffies; + ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, + i_es_lru); + /* + * If there are no non-precached inodes left on the + * list, start releasing precached extents. + */ + if (ext4_test_inode_state(&ei->vfs_inode, + EXT4_STATE_EXT_PRECACHED)) + skip_precached = 0; + goto retry; + } + + spin_unlock(&sbi->s_es_lru_lock); + + if (locked_ei && nr_shrunk == 0) + nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); + + return nr_shrunk; +} + +static unsigned long ext4_es_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + unsigned long nr; + struct ext4_sb_info *sbi; + + sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); + nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); + trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr); + return nr; +} + +static unsigned long ext4_es_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct ext4_sb_info *sbi = container_of(shrink, + struct ext4_sb_info, s_es_shrinker); + int nr_to_scan = sc->nr_to_scan; + int ret, nr_shrunk; + + ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); + trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret); + + if (!nr_to_scan) + return ret; + + nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); + + trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); + return nr_shrunk; +} + +void ext4_es_register_shrinker(struct ext4_sb_info *sbi) +{ + INIT_LIST_HEAD(&sbi->s_es_lru); + spin_lock_init(&sbi->s_es_lru_lock); + sbi->s_es_last_sorted = 0; + sbi->s_es_shrinker.scan_objects = ext4_es_scan; + sbi->s_es_shrinker.count_objects = ext4_es_count; + sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; + register_shrinker(&sbi->s_es_shrinker); +} + +void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) +{ + unregister_shrinker(&sbi->s_es_shrinker); +} + +void ext4_es_lru_add(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + ei->i_touch_when = jiffies; + + if (!list_empty(&ei->i_es_lru)) + return; + + spin_lock(&sbi->s_es_lru_lock); + if (list_empty(&ei->i_es_lru)) + list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); + spin_unlock(&sbi->s_es_lru_lock); +} + +void ext4_es_lru_del(struct inode *inode) +{ + struct ext4_inode_info *ei = 
EXT4_I(inode);
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+	spin_lock(&sbi->s_es_lru_lock);
+	if (!list_empty(&ei->i_es_lru))
+		list_del_init(&ei->i_es_lru);
+	spin_unlock(&sbi->s_es_lru_lock);
+}
+
+static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+				       int nr_to_scan)
+{
+	struct inode *inode = &ei->vfs_inode;
+	struct ext4_es_tree *tree = &ei->i_es_tree;
+	struct rb_node *node;
+	struct extent_status *es;
+	unsigned long nr_shrunk = 0;
+	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+
+	if (ei->i_es_lru_nr == 0)
+		return 0;
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
+	    __ratelimit(&_rs))
+		ext4_warning(inode->i_sb, "forced shrink of precached extents");
+
+	node = rb_first(&tree->root);
+	while (node != NULL) {
+		es = rb_entry(node, struct extent_status, rb_node);
+		node = rb_next(&es->rb_node);
+		/*
+		 * We can't reclaim delayed extents from the status tree
+		 * because fiemap, bigalloc, and seek_data/hole need to
+		 * use them.
+		 */
+		if (!ext4_es_is_delayed(es)) {
+			rb_erase(&es->rb_node, &tree->root);
+			ext4_es_free_extent(inode, es);
+			nr_shrunk++;
+			if (--nr_to_scan == 0)
+				break;
+		}
+	}
+	tree->cache_es = NULL;
+	return nr_shrunk;
+}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
new file mode 100644
index 00000000000..f1b62a41992
--- /dev/null
+++ b/fs/ext4/extents_status.h
@@ -0,0 +1,146 @@
+/*
+ * fs/ext4/extents_status.h
+ *
+ * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
+ * Modified by
+ *	Allison Henderson <achender@linux.vnet.ibm.com>
+ *	Zheng Liu <wenqing.lz@taobao.com>
+ *
+ */
+
+#ifndef _EXT4_EXTENTS_STATUS_H
+#define _EXT4_EXTENTS_STATUS_H
+
+/*
+ * Turn on ES_DEBUG__ to get lots of info about extent status operations.
+ */
+#ifdef ES_DEBUG__
+#define es_debug(fmt, ...)	printk(fmt, ##__VA_ARGS__)
+#else
+#define es_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
+#endif
+
+/*
+ * With ES_AGGRESSIVE_TEST defined, the result of es caching will be
+ * checked with old map_block's result.
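The ES_* definitions just below pack the extent status into the top bits of the 64-bit es_pblk field. A standalone sketch of that packing, mirroring ext4_es_store_pblock_status() and ext4_es_is_written() from this header.

#include <stdio.h>
#include <stdint.h>

#define ES_SHIFT   60
#define ES_WRITTEN (1ULL << 63)
#define ES_MASK    (0xFULL << ES_SHIFT)

int main(void)
{
	uint64_t es_pblk;
	uint64_t pb = 123456;		/* physical block number */
	unsigned int status = 1 << 3;	/* EXTENT_STATUS_WRITTEN */

	/* ext4_es_store_pblock_status() equivalent: status bits land in
	 * bits 63-60, the physical block in the remaining low bits */
	es_pblk = ((uint64_t)status << ES_SHIFT) | (pb & ~ES_MASK);

	printf("pblock=%llu written=%d\n",
	       (unsigned long long)(es_pblk & ~ES_MASK),	/* 123456 */
	       (es_pblk & ES_WRITTEN) != 0);			/* 1 */
	return 0;
}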
+ */ +#define ES_AGGRESSIVE_TEST__ + +/* + * These flags live in the high bits of extent_status.es_pblk + */ +#define ES_SHIFT 60 + +#define EXTENT_STATUS_WRITTEN (1 << 3) +#define EXTENT_STATUS_UNWRITTEN (1 << 2) +#define EXTENT_STATUS_DELAYED (1 << 1) +#define EXTENT_STATUS_HOLE (1 << 0) + +#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \ + EXTENT_STATUS_UNWRITTEN | \ + EXTENT_STATUS_DELAYED | \ + EXTENT_STATUS_HOLE) + +#define ES_WRITTEN (1ULL << 63) +#define ES_UNWRITTEN (1ULL << 62) +#define ES_DELAYED (1ULL << 61) +#define ES_HOLE (1ULL << 60) + +#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \ + ES_DELAYED | ES_HOLE) + +struct ext4_sb_info; +struct ext4_extent; + +struct extent_status { + struct rb_node rb_node; + ext4_lblk_t es_lblk; /* first logical block extent covers */ + ext4_lblk_t es_len; /* length of extent in block */ + ext4_fsblk_t es_pblk; /* first physical block */ +}; + +struct ext4_es_tree { + struct rb_root root; + struct extent_status *cache_es; /* recently accessed extent */ +}; + +extern int __init ext4_init_es(void); +extern void ext4_exit_es(void); +extern void ext4_es_init_tree(struct ext4_es_tree *tree); + +extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); +extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); +extern void ext4_es_find_delayed_extent_range(struct inode *inode, + ext4_lblk_t lblk, ext4_lblk_t end, + struct extent_status *es); +extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es); + +static inline int ext4_es_is_written(struct extent_status *es) +{ + return (es->es_pblk & ES_WRITTEN) != 0; +} + +static inline int ext4_es_is_unwritten(struct extent_status *es) +{ + return (es->es_pblk & ES_UNWRITTEN) != 0; +} + +static inline int ext4_es_is_delayed(struct extent_status *es) +{ + return (es->es_pblk & ES_DELAYED) != 0; +} + +static inline int ext4_es_is_hole(struct extent_status *es) +{ + return (es->es_pblk & ES_HOLE) != 0; +} + +static inline unsigned int ext4_es_status(struct extent_status *es) +{ + return es->es_pblk >> ES_SHIFT; +} + +static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) +{ + return es->es_pblk & ~ES_MASK; +} + +static inline void ext4_es_store_pblock(struct extent_status *es, + ext4_fsblk_t pb) +{ + ext4_fsblk_t block; + + block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); + es->es_pblk = block; +} + +static inline void ext4_es_store_status(struct extent_status *es, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t) + (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | + (es->es_pblk & ~ES_MASK)); +} + +static inline void ext4_es_store_pblock_status(struct extent_status *es, + ext4_fsblk_t pb, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t) + (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | + (pb & ~ES_MASK)); +} + +extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); +extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); +extern void ext4_es_lru_add(struct inode *inode); +extern void ext4_es_lru_del(struct inode *inode); + +#endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ac35ec58db5..8695f70af1e 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -21,8 +21,13 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> 
-#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> +#include <linux/mount.h> +#include <linux/path.h> +#include <linux/aio.h> +#include <linux/quotaops.h> +#include <linux/pagevec.h> +#include "ext4.h" +#include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" @@ -31,14 +36,19 @@ * from ext4_file_open: open gets called at every open, but release * gets called only when /all/ the files are closed. */ -static int ext4_release_file (struct inode * inode, struct file * filp) +static int ext4_release_file(struct inode *inode, struct file *filp) { + if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { + ext4_alloc_da_blocks(inode); + ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && - (atomic_read(&inode->i_writecount) == 1)) + (atomic_read(&inode->i_writecount) == 1) && + !EXT4_I(inode)->i_reserved_data_blocks) { down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_reservation(inode); + ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); } if (is_dx(inode) && filp->private_data) @@ -47,110 +57,562 @@ static int ext4_release_file (struct inode * inode, struct file * filp) return 0; } +static void ext4_unwritten_wait(struct inode *inode) +{ + wait_queue_head_t *wq = ext4_ioend_wq(inode); + + wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)); +} + +/* + * This tests whether the IO in question is block-aligned or not. + * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they + * are converted to written only after the IO is complete. Until they are + * mapped, these blocks appear as holes, so dio_zero_block() will assume that + * it needs to zero out portions of the start and/or end block. If 2 AIO + * threads are at work on the same unwritten block, they must be synchronized + * or one thread will zero the other's data, causing corruption. + */ +static int +ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) +{ + struct super_block *sb = inode->i_sb; + int blockmask = sb->s_blocksize - 1; + + if (pos >= i_size_read(inode)) + return 0; + + if ((pos | iov_iter_alignment(from)) & blockmask) + return 1; + + return 0; +} + static ssize_t -ext4_file_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(iocb->ki_filp); + struct mutex *aio_mutex = NULL; + struct blk_plug plug; + int o_direct = file->f_flags & O_DIRECT; + int overwrite = 0; + size_t length = iov_iter_count(from); ssize_t ret; - int err; + loff_t pos = iocb->ki_pos; + + /* + * Unaligned direct AIO must be serialized; see comment above + * In the case of O_APPEND, assume that we must always serialize + */ + if (o_direct && + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && + !is_sync_kiocb(iocb) && + (file->f_flags & O_APPEND || + ext4_unaligned_aio(inode, from, pos))) { + aio_mutex = ext4_aio_mutex(inode); + mutex_lock(aio_mutex); + ext4_unwritten_wait(inode); + } + + mutex_lock(&inode->i_mutex); + if (file->f_flags & O_APPEND) + iocb->ki_pos = pos = i_size_read(inode); /* * If we have encountered a bitmap-format file, the size limit * is smaller than s_maxbytes, which is for extent-mapped files. 
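 * Concretely, a write that starts at or beyond s_bitmap_maxbytes fails * with EFBIG below, and one that merely crosses the limit is shortened * to it: an indirect-mapped inode simply cannot address blocks past it.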
 */ - - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - size_t length = iov_length(iov, nr_segs); - if (pos > sbi->s_bitmap_maxbytes) - return -EFBIG; + if ((pos > sbi->s_bitmap_maxbytes) || + (pos == sbi->s_bitmap_maxbytes && length > 0)) { + mutex_unlock(&inode->i_mutex); + ret = -EFBIG; + goto errout; + } + + if (pos + length > sbi->s_bitmap_maxbytes) + iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos); + } + + if (o_direct) { + blk_start_plug(&plug); + + iocb->private = &overwrite; + + /* check whether we do a DIO overwrite or not */ + if (ext4_should_dioread_nolock(inode) && !aio_mutex && + !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + int err, len; + + map.m_lblk = pos >> blkbits; + map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits) + - map.m_lblk; + len = map.m_len; - if (pos + length > sbi->s_bitmap_maxbytes) { - nr_segs = iov_shorten((struct iovec *)iov, nr_segs, - sbi->s_bitmap_maxbytes - pos); + err = ext4_map_blocks(NULL, inode, &map, 0); + /* + * 'err == len' means that all of the blocks have + * been preallocated, whether or not they are + * initialized. To exclude unwritten extents, we + * need to check m_flags. There are two cases that + * indicate an initialized extent: 1) if we hit the + * extent cache, the EXT4_MAP_MAPPED flag is + * returned; 2) if we do a real lookup, no flags are + * returned. So we check both conditions. + */ + if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) + overwrite = 1; } } - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - /* - * Skip flushing if there was an error, or if nothing was written. - */ - if (ret <= 0) - return ret; + ret = __generic_file_write_iter(iocb, from); + mutex_unlock(&inode->i_mutex); + + if (ret > 0) { + ssize_t err; + + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) + ret = err; + } + if (o_direct) + blk_finish_plug(&plug); + +errout: + if (aio_mutex) + mutex_unlock(aio_mutex); + return ret; +} +static const struct vm_operations_struct ext4_file_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = ext4_page_mkwrite, + .remap_pages = generic_file_remap_pages, +}; + +static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + file_accessed(file); + vma->vm_ops = &ext4_file_vm_ops; + return 0; +} + +static int ext4_file_open(struct inode * inode, struct file * filp) +{ + struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct vfsmount *mnt = filp->f_path.mnt; + struct path path; + char buf[64], *cp; + + if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && + !(sb->s_flags & MS_RDONLY))) { + sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; + /* + * Sample where the filesystem has been mounted and + * store it in the superblock for sysadmin convenience + * when trying to sort through large numbers of block + * devices or filesystem images.
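+ * The sampled path is what e2fsprogs tools such as dumpe2fs report + * as "Last mounted on:", which helps identify a filesystem image + * that has been separated from its original mount point.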
+ */ + memset(buf, 0, sizeof(buf)); + path.mnt = mnt; + path.dentry = mnt->mnt_root; + cp = d_path(&path, buf, sizeof(buf)); + if (!IS_ERR(cp)) { + handle_t *handle; + int err; + + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) { + ext4_journal_stop(handle); + return err; + } + strlcpy(sbi->s_es->s_last_mounted, cp, + sizeof(sbi->s_es->s_last_mounted)); + ext4_handle_dirty_super(handle, sb); + ext4_journal_stop(handle); + } + } /* - * If the inode is IS_SYNC, or is O_SYNC and we are doing data - * journalling then we need to make sure that we force the transaction - * to disk to keep all metadata uptodate synchronously. + * Set up the jbd2_inode if we are opening the inode for + * writing and the journal is present */ - if (file->f_flags & O_SYNC) { + if (filp->f_mode & FMODE_WRITE) { + int ret = ext4_inode_attach_jinode(inode); + if (ret < 0) + return ret; + } + return dquot_file_open(inode, filp); +} + +/* + * Here we use ext4_map_blocks() to get a block mapping for an extent-based + * file rather than ext4_ext_walk_space(), because this lets us support + * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped files in the same + * function. Once the extent status tree is fully implemented, it will + * track all extent status for a file and we will be able to use it + * directly to retrieve the offset for SEEK_DATA/SEEK_HOLE. + */ + +/* + * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we also need to + * look up the page cache to check whether there is data in + * [startoff, endoff]: if this range contains an unwritten extent, we + * treat the extent as data or as a hole according to whether the + * page cache holds data for it. + */ +static int ext4_find_unwritten_pgoff(struct inode *inode, + int whence, + struct ext4_map_blocks *map, + loff_t *offset) +{ + struct pagevec pvec; + unsigned int blkbits; + pgoff_t index; + pgoff_t end; + loff_t endoff; + loff_t startoff; + loff_t lastoff; + int found = 0; + + blkbits = inode->i_sb->s_blocksize_bits; + startoff = *offset; + lastoff = startoff; + endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; + + index = startoff >> PAGE_CACHE_SHIFT; + end = endoff >> PAGE_CACHE_SHIFT; + + pagevec_init(&pvec, 0); + do { + int i, num; + unsigned long nr_pages; + + num = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, + (pgoff_t)num); + if (nr_pages == 0) { + if (whence == SEEK_DATA) + break; + + BUG_ON(whence != SEEK_HOLE); + /* + * If this is the first pass through the loop and + * the offset is not beyond the end offset, there + * is a hole at this offset + */ + if (lastoff == startoff || lastoff < endoff) + found = 1; + break; + } + + /* - * If we are non-data-journaled, then the dirty data has - * already been flushed to backing store by generic_osync_inode, - * and the inode has been flushed too if there have been any - * modifications other than mere timestamp updates. - * - * Open question --- do we care about flushing timestamps too - * if the inode is IS_SYNC? + * If this is the first pass through the loop and the + * offset is smaller than the first page offset, there is + * a hole at this offset.
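+ * For example, a SEEK_HOLE scan starting at offset 0 whose first + * cached page begins at byte 8192 can report the hole at offset 0 + * right away, since nothing earlier is present in the page cache.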
 */ - if (!ext4_should_journal_data(inode)) - return ret; + if (lastoff == startoff && whence == SEEK_HOLE && + lastoff < page_offset(pvec.pages[0])) { + found = 1; + break; + } + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + /* + * If the current offset is not beyond the end of given + * range, it will be a hole. + */ + if (lastoff < endoff && whence == SEEK_HOLE && + page->index > end) { + found = 1; + *offset = lastoff; + goto out; + } - goto force_commit; + lock_page(page); + + if (unlikely(page->mapping != inode->i_mapping)) { + unlock_page(page); + continue; + } + + if (!page_has_buffers(page)) { + unlock_page(page); + continue; + } + + if (page_has_buffers(page)) { + lastoff = page_offset(page); + bh = head = page_buffers(page); + do { + if (buffer_uptodate(bh) || + buffer_unwritten(bh)) { + if (whence == SEEK_DATA) + found = 1; + } else { + if (whence == SEEK_HOLE) + found = 1; + } + if (found) { + *offset = max_t(loff_t, + startoff, lastoff); + unlock_page(page); + goto out; + } + lastoff += bh->b_size; + bh = bh->b_this_page; + } while (bh != head); + } + + lastoff = page_offset(page) + PAGE_SIZE; + unlock_page(page); + } + + /* + * The number of pages is less than we asked for; there + * must be a hole in there. + */ + if (nr_pages < num && whence == SEEK_HOLE) { + found = 1; + *offset = lastoff; + break; + } + + index = pvec.pages[i - 1]->index + 1; + pagevec_release(&pvec); + } while (index <= end); + +out: + pagevec_release(&pvec); + return found; +} + +/* + * ext4_seek_data() retrieves the offset for SEEK_DATA. + */ +static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) +{ + struct inode *inode = file->f_mapping->host; + struct ext4_map_blocks map; + struct extent_status es; + ext4_lblk_t start, last, end; + loff_t dataoff, isize; + int blkbits; + int ret = 0; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + if (offset >= isize) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; } - /* - * So we know that there has been no forced data flush. If the inode - * is marked IS_SYNC, we need to force one ourselves. - */ - if (!IS_SYNC(inode)) - return ret; + blkbits = inode->i_sb->s_blocksize_bits; + start = offset >> blkbits; + last = start; + end = isize >> blkbits; + dataoff = offset; - /* - * Open question #2 --- should we force data to disk here too? If we - * don't, the only impact is that data=writeback filesystems won't - * flush data to disk automatically on IS_SYNC, only metadata (but - * historically, that is what ext2 has done.) - */ + do { + map.m_lblk = last; + map.m_len = end - last + 1; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { + if (last != start) + dataoff = (loff_t)last << blkbits; + break; + } -force_commit: - err = ext4_force_commit(inode->i_sb); - if (err) - return err; - return ret; + /* + * If there is a delayed extent at this offset, + * treat it as data. + */ + ext4_es_find_delayed_extent_range(inode, last, last, &es); + if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { + if (last != start) + dataoff = (loff_t)last << blkbits; + break; + } + + /* + * If there is an unwritten extent at this offset, + * it counts as data or as a hole according to whether + * the page cache has data for it.
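+ * For example, after a 1 MiB fallocate (unwritten extent) followed + * by a single buffered 4 KiB write at offset 64 KiB, SEEK_DATA from + * offset 0 should land on 64 KiB: the extent is unwritten on disk, + * but the cached page makes that offset data.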
+ */ + if (map.m_flags & EXT4_MAP_UNWRITTEN) { + int unwritten; + unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, + &map, &dataoff); + if (unwritten) + break; + } + + last++; + dataoff = (loff_t)last << blkbits; + } while (last <= end); + + mutex_unlock(&inode->i_mutex); + + if (dataoff > isize) + return -ENXIO; + + return vfs_setpos(file, dataoff, maxsize); +} + +/* + * ext4_seek_hole() retrieves the offset for SEEK_HOLE. + */ +static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) +{ + struct inode *inode = file->f_mapping->host; + struct ext4_map_blocks map; + struct extent_status es; + ext4_lblk_t start, last, end; + loff_t holeoff, isize; + int blkbits; + int ret = 0; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + if (offset >= isize) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; + } + + blkbits = inode->i_sb->s_blocksize_bits; + start = offset >> blkbits; + last = start; + end = isize >> blkbits; + holeoff = offset; + + do { + map.m_lblk = last; + map.m_len = end - last + 1; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { + last += ret; + holeoff = (loff_t)last << blkbits; + continue; + } + + /* + * If there is a delayed extent at this offset, + * skip over it. + */ + ext4_es_find_delayed_extent_range(inode, last, last, &es); + if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { + last = es.es_lblk + es.es_len; + holeoff = (loff_t)last << blkbits; + continue; + } + + /* + * If there is an unwritten extent at this offset, + * it counts as data or as a hole according to whether + * the page cache has data for it. + */ + if (map.m_flags & EXT4_MAP_UNWRITTEN) { + int unwritten; + unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, + &map, &holeoff); + if (!unwritten) { + last += ret; + holeoff = (loff_t)last << blkbits; + continue; + } + } + + /* find a hole */ + break; + } while (last <= end); + + mutex_unlock(&inode->i_mutex); + + if (holeoff > isize) + holeoff = isize; + + return vfs_setpos(file, holeoff, maxsize); +} + +/* + * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values + * by calling generic_file_llseek_size() with the appropriate maxbytes + * value for each.
+ */ +loff_t ext4_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxbytes; + + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; + else + maxbytes = inode->i_sb->s_maxbytes; + + switch (whence) { + case SEEK_SET: + case SEEK_CUR: + case SEEK_END: + return generic_file_llseek_size(file, offset, whence, + maxbytes, i_size_read(inode)); + case SEEK_DATA: + return ext4_seek_data(file, offset, maxbytes); + case SEEK_HOLE: + return ext4_seek_hole(file, offset, maxbytes); + } + + return -EINVAL; } const struct file_operations ext4_file_operations = { - .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = ext4_file_write, - .ioctl = ext4_ioctl, + .llseek = ext4_llseek, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = ext4_file_write_iter, + .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif - .mmap = generic_file_mmap, - .open = generic_file_open, + .mmap = ext4_file_mmap, + .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, + .fallocate = ext4_fallocate, }; const struct inode_operations ext4_file_inode_operations = { - .truncate = ext4_truncate, .setattr = ext4_setattr, -#ifdef CONFIG_EXT4DEV_FS_XATTR + .getattr = ext4_getattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif - .permission = ext4_permission, - .fallocate = ext4_fallocate, + .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, + .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 8d50879d1c2..a8bc47f75fa 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -27,8 +27,51 @@ #include <linux/sched.h> #include <linux/writeback.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> +#include <linux/blkdev.h> + +#include "ext4.h" +#include "ext4_jbd2.h" + +#include <trace/events/ext4.h> + +/* + * If we're not journaling and this is a just-created file, we have to + * sync our parent directory (if it was freshly created) since + * otherwise it will only be written by writeback, leaving a huge + * window during which a crash may lose the file. This may apply for + * the parent directory's parent as well, and so on recursively, if + * they are also freshly created. + */ +static int ext4_sync_parent(struct inode *inode) +{ + struct dentry *dentry = NULL; + struct inode *next; + int ret = 0; + + if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) + return 0; + inode = igrab(inode); + while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { + ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); + dentry = d_find_any_alias(inode); + if (!dentry) + break; + next = igrab(dentry->d_parent->d_inode); + dput(dentry); + if (!next) + break; + iput(inode); + inode = next; + ret = sync_mapping_buffers(inode->i_mapping); + if (ret) + break; + ret = sync_inode_metadata(inode, 1); + if (ret) + break; + } + iput(inode); + return ret; +} /* * akpm: A new design for ext4_sync_file(). @@ -42,22 +85,42 @@ * inode to disk. 
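 * Note the datasync distinction in the code below: fdatasync() waits * only on the transaction recorded in i_datasync_tid, while fsync() * waits on i_sync_tid, so a pure data overwrite can avoid committing * unrelated metadata-only transactions.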
*/ -int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) +int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { - struct inode *inode = dentry->d_inode; - int ret = 0; + struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + int ret = 0, err; + tid_t commit_tid; + bool needs_barrier = false; J_ASSERT(ext4_journal_current_handle() == NULL); + trace_ext4_sync_file_enter(file, datasync); + + if (inode->i_sb->s_flags & MS_RDONLY) { + /* Make sure that we read updated s_mount_flags value */ + smp_rmb(); + if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + ret = -EROFS; + goto out; + } + + if (!journal) { + ret = generic_file_fsync(file, start, end, datasync); + if (!ret && !hlist_empty(&inode->i_dentry)) + ret = ext4_sync_parent(inode); + goto out; + } + + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; /* - * data=writeback: + * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. - * sync_inode() will sync the metadata - * - * data=ordered: - * The caller's filemap_fdatawrite() will write the data and - * sync_inode() will write the inode if it is dirty. Then the caller's - * filemap_fdatawait() will wait on the pages. + * Metadata is in the journal, we wait for proper transaction to + * commit here. * * data=journal: * filemap_fdatawrite won't do anything (the buffers are clean). @@ -72,17 +135,17 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) goto out; } - /* - * The VFS has written the file data. If the inode is unaltered - * then we need not start a commit. - */ - if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0, /* sys_fsync did this */ - }; - ret = sync_inode(inode, &wbc); + commit_tid = datasync ? 
ei->i_datasync_tid : ei->i_sync_tid; + if (journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(journal, commit_tid)) + needs_barrier = true; + ret = jbd2_complete_transaction(journal, commit_tid); + if (needs_barrier) { + err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + if (!ret) + ret = err; } out: + trace_ext4_sync_file_exit(inode, ret); return ret; } diff --git a/fs/ext4/group.h b/fs/ext4/group.h deleted file mode 100644 index 7eb0604e7ee..00000000000 --- a/fs/ext4/group.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * linux/fs/ext4/group.h - * - * Copyright (C) 2007 Cluster File Systems, Inc - * - * Author: Andreas Dilger <adilger@clusterfs.com> - */ - -#ifndef _LINUX_EXT4_GROUP_H -#define _LINUX_EXT4_GROUP_H - -extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, - struct ext4_group_desc *gdp); -extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, - struct ext4_group_desc *gdp); -struct buffer_head *read_block_bitmap(struct super_block *sb, - ext4_group_t block_group); -extern unsigned ext4_init_block_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); -#define ext4_free_blocks_after_init(sb, group, desc) \ - ext4_init_block_bitmap(sb, NULL, group, desc) -extern unsigned ext4_init_inode_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); -extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); -#endif /* _LINUX_EXT4_GROUP_H */ diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 1555024e3b3..3d586f02883 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -11,8 +11,8 @@ #include <linux/fs.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> #include <linux/cryptohash.h> +#include "ext4.h" #define DELTA 0x9E3779B9 @@ -27,7 +27,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[]) sum += DELTA; b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); - } while(--n); + } while (--n); buf[0] += b0; buf[1] += b1; @@ -35,23 +35,43 @@ static void TEA_transform(__u32 buf[4], __u32 const in[]) /* The old legacy hash */ -static __u32 dx_hack_hash (const char *name, int len) +static __u32 dx_hack_hash_unsigned(const char *name, int len) { - __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const unsigned char *ucp = (const unsigned char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static __u32 dx_hack_hash_signed(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const signed char *scp = (const signed char *) name; + while (len--) { - __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); - if (hash & 0x80000000) hash -= 0x7fffffff; + if (hash & 0x80000000) + hash -= 0x7fffffff; hash1 = hash0; hash0 = hash; } - return (hash0 << 1); + return hash0 << 1; } -static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) { __u32 pad, val; int i; + const signed char *scp = (const signed char *) msg; pad = (__u32)len | ((__u32)len << 8); pad |= pad << 16; @@ -59,10 +79,38 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) val = pad; if (len > num*4) len = 
num * 4; - for (i=0; i < len; i++) { + for (i = 0; i < len; i++) { if ((i % 4) == 0) val = pad; - val = msg[i] + (val << 8); + val = ((int) scp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const unsigned char *ucp = (const unsigned char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) ucp[i]) + (val << 8); if ((i % 4) == 3) { *buf++ = val; val = pad; @@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) const char *p; int i; __u32 in[8], buf[4]; + void (*str2hashbuf)(const char *, int, __u32 *, int) = + str2hashbuf_signed; /* Initialize the default seed for the hash checksum functions */ buf[0] = 0x67452301; @@ -104,22 +154,27 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) /* Check to see if the seed is all zero's */ if (hinfo->seed) { - for (i=0; i < 4; i++) { - if (hinfo->seed[i]) + for (i = 0; i < 4; i++) { + if (hinfo->seed[i]) { + memcpy(buf, hinfo->seed, sizeof(buf)); break; + } } - if (i < 4) - memcpy(buf, hinfo->seed, sizeof(buf)); } switch (hinfo->hash_version) { + case DX_HASH_LEGACY_UNSIGNED: + hash = dx_hack_hash_unsigned(name, len); + break; case DX_HASH_LEGACY: - hash = dx_hack_hash(name, len); + hash = dx_hack_hash_signed(name, len); break; + case DX_HASH_HALF_MD4_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; case DX_HASH_HALF_MD4: p = name; while (len > 0) { - str2hashbuf(p, len, in, 8); + (*str2hashbuf)(p, len, in, 8); half_md4_transform(buf, in); len -= 32; p += 32; @@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) minor_hash = buf[2]; hash = buf[1]; break; + case DX_HASH_TEA_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; case DX_HASH_TEA: p = name; while (len > 0) { - str2hashbuf(p, len, in, 4); + (*str2hashbuf)(p, len, in, 4); TEA_transform(buf, in); len -= 16; p += 16; @@ -143,8 +200,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) return -1; } hash = hash & ~1; - if (hash == (EXT4_HTREE_EOF << 1)) - hash = (EXT4_HTREE_EOF-1) << 1; + if (hash == (EXT4_HTREE_EOF_32BIT << 1)) + hash = (EXT4_HTREE_EOF_32BIT - 1) << 1; hinfo->hash = hash; hinfo->minor_hash = minor_hash; return 0; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 8036b9b5376..5b87fc36aab 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -15,8 +15,6 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/quotaops.h> @@ -26,9 +24,12 @@ #include <linux/blkdev.h> #include <asm/byteorder.h> +#include "ext4.h" +#include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "group.h" + +#include <trace/events/ext4.h> /* * ialloc.c contains the inodes allocation and deallocation routines @@ -49,7 +50,7 @@ * need to use it within a single byte (to ensure we get endianness right). * We can use memset for the rest of the bitmap as there are no other users. 
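 * For example, with 8192 inodes per group and a 4 KiB bitmap block * (32768 bits), bits 8192..32767 are padding and are all set to ones.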
*/ -void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap) { int i; @@ -64,33 +65,54 @@ void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) } /* Initializes an uninitialized inode bitmap */ -unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, - ext4_group_t block_group, - struct ext4_group_desc *gdp) +static unsigned ext4_init_inode_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t block_group, + struct ext4_group_desc *gdp) { + struct ext4_group_info *grp; struct ext4_sb_info *sbi = EXT4_SB(sb); - J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. */ - if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, __FUNCTION__, "Checksum bad for group %lu\n", - block_group); - gdp->bg_free_blocks_count = 0; - gdp->bg_free_inodes_count = 0; - gdp->bg_itable_unused = 0; - memset(bh->b_data, 0xff, sb->s_blocksize); + if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { + ext4_error(sb, "Checksum bad for group %u", block_group); + grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); return 0; } memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); + ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh, + EXT4_INODES_PER_GROUP(sb) / 8); + ext4_group_desc_csum_set(sb, block_group, gdp); return EXT4_INODES_PER_GROUP(sb); } +void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) +{ + if (uptodate) { + set_buffer_uptodate(bh); + set_bitmap_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + /* * Read the inode allocation bitmap for a given block_group, reading * into the specified slot in the superblock's bitmap cache. @@ -98,34 +120,93 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, * Return buffer_head of bitmap on success or NULL. 
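 * For an INODE_UNINIT group the bitmap is constructed in memory and * never read from disk; otherwise it is read with REQ_META | REQ_PRIO * and checksum-verified under the group lock before first use.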
 */ static struct buffer_head * -read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) +ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) { struct ext4_group_desc *desc; struct buffer_head *bh = NULL; + ext4_fsblk_t bitmap_blk; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) - goto error_out; + return NULL; + + bitmap_blk = ext4_inode_bitmap(sb, desc); + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext4_error(sb, "Cannot read inode bitmap - " + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); + return NULL; + } + if (bitmap_uptodate(bh)) + goto verify; + + lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + goto verify; + } + + ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { - bh = sb_getblk(sb, ext4_inode_bitmap(sb, desc)); - if (!buffer_uptodate(bh)) { - lock_buffer(bh); - if (!buffer_uptodate(bh)) { - ext4_init_inode_bitmap(sb, bh, block_group, - desc); - set_buffer_uptodate(bh); - } - unlock_buffer(bh); + ext4_init_inode_bitmap(sb, bh, block_group, desc); + set_bitmap_uptodate(bh); + set_buffer_uptodate(bh); + set_buffer_verified(bh); + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + return bh; + } + ext4_unlock_group(sb, block_group); + + if (buffer_uptodate(bh)) { + /* + * If the group is not uninit and the bh is uptodate, + * then the bitmap is uptodate as well. + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); + goto verify; + } + /* + * submit the buffer_head for reading + */ + trace_ext4_load_inode_bitmap(sb, block_group); + bh->b_end_io = ext4_end_bitmap_read; + get_bh(bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + put_bh(bh); + ext4_error(sb, "Cannot read inode bitmap - " + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); + return NULL; + } + +verify: + ext4_lock_group(sb, block_group); + if (!buffer_verified(bh) && + !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh, + EXT4_INODES_PER_GROUP(sb) / 8)) { + ext4_unlock_group(sb, block_group); + put_bh(bh); + ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " + "inode_bitmap = %llu", block_group, bitmap_blk); + grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, desc); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); } - } else { - bh = sb_bread(sb, ext4_inode_bitmap(sb, desc)); - } - if (!bh) - ext4_error(sb, "read_inode_bitmap", - "Cannot read inode bitmap - " - "block_group = %lu, inode_bitmap = %llu", - block_group, ext4_inode_bitmap(sb, desc)); -error_out: + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + return NULL; + } + ext4_unlock_group(sb, block_group); + set_buffer_verified(bh); return bh; } @@ -145,63 +226,68 @@ error_out: * though), and then we'd have two inodes sharing the * same inode number and space on the harddisk.
*/ -void ext4_free_inode (handle_t *handle, struct inode * inode) +void ext4_free_inode(handle_t *handle, struct inode *inode) { - struct super_block * sb = inode->i_sb; + struct super_block *sb = inode->i_sb; int is_directory; unsigned long ino; struct buffer_head *bitmap_bh = NULL; struct buffer_head *bh2; ext4_group_t block_group; unsigned long bit; - struct ext4_group_desc * gdp; - struct ext4_super_block * es; + struct ext4_group_desc *gdp; + struct ext4_super_block *es; struct ext4_sb_info *sbi; - int fatal = 0, err; + int fatal = 0, err, count, cleared; + struct ext4_group_info *grp; - if (atomic_read(&inode->i_count) > 1) { - printk ("ext4_free_inode: inode has count=%d\n", - atomic_read(&inode->i_count)); + if (!sb) { + printk(KERN_ERR "EXT4-fs: %s:%d: inode on " + "nonexistent device\n", __func__, __LINE__); return; } - if (inode->i_nlink) { - printk ("ext4_free_inode: inode has nlink=%d\n", - inode->i_nlink); + if (atomic_read(&inode->i_count) > 1) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", + __func__, __LINE__, inode->i_ino, + atomic_read(&inode->i_count)); return; } - if (!sb) { - printk("ext4_free_inode: inode on nonexistent device\n"); + if (inode->i_nlink) { + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n", + __func__, __LINE__, inode->i_ino, inode->i_nlink); return; } sbi = EXT4_SB(sb); ino = inode->i_ino; - ext4_debug ("freeing inode %lu\n", ino); + ext4_debug("freeing inode %lu\n", ino); + trace_ext4_free_inode(inode); /* * Note: we must free any quota before locking the superblock, * as writing the quota to disk may need the lock as well. */ - DQUOT_INIT(inode); + dquot_initialize(inode); ext4_xattr_delete_inode(handle, inode); - DQUOT_FREE_INODE(inode); - DQUOT_DROP(inode); + dquot_free_inode(inode); + dquot_drop(inode); is_directory = S_ISDIR(inode->i_mode); /* Do this BEFORE marking the inode not in use or returning an error */ - clear_inode (inode); + ext4_clear_inode(inode); es = EXT4_SB(sb)->s_es; if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { - ext4_error (sb, "ext4_free_inode", - "reserved or nonexistent inode %lu", ino); + ext4_error(sb, "reserved or nonexistent inode %lu", ino); goto error_return; } block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); - bitmap_bh = read_inode_bitmap(sb, block_group); - if (!bitmap_bh) + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); + /* Don't bother if the inode bitmap is corrupt. */ + grp = ext4_get_group_info(sb, block_group); + if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh) goto error_return; BUFFER_TRACE(bitmap_bh, "get_write_access"); @@ -209,84 +295,97 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) if (fatal) goto error_return; - /* Ok, now we can actually update the inode bitmaps.. 
*/ - if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), - bit, bitmap_bh->b_data)) - ext4_error (sb, "ext4_free_inode", - "bit already cleared for inode %lu", ino); - else { - gdp = ext4_get_group_desc (sb, block_group, &bh2); - + fatal = -ESRCH; + gdp = ext4_get_group_desc(sb, block_group, &bh2); + if (gdp) { BUFFER_TRACE(bh2, "get_write_access"); fatal = ext4_journal_get_write_access(handle, bh2); - if (fatal) goto error_return; - - if (gdp) { - spin_lock(sb_bgl_lock(sbi, block_group)); - gdp->bg_free_inodes_count = cpu_to_le16( - le16_to_cpu(gdp->bg_free_inodes_count) + 1); - if (is_directory) - gdp->bg_used_dirs_count = cpu_to_le16( - le16_to_cpu(gdp->bg_used_dirs_count) - 1); - gdp->bg_checksum = ext4_group_desc_csum(sbi, - block_group, gdp); - spin_unlock(sb_bgl_lock(sbi, block_group)); - percpu_counter_inc(&sbi->s_freeinodes_counter); - if (is_directory) - percpu_counter_dec(&sbi->s_dirs_counter); + } + ext4_lock_group(sb, block_group); + cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data); + if (fatal || !cleared) { + ext4_unlock_group(sb, block_group); + goto out; + } + count = ext4_free_inodes_count(sb, gdp) + 1; + ext4_free_inodes_set(sb, gdp, count); + if (is_directory) { + count = ext4_used_dirs_count(sb, gdp) - 1; + ext4_used_dirs_set(sb, gdp, count); + percpu_counter_dec(&sbi->s_dirs_counter); + } + ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh, + EXT4_INODES_PER_GROUP(sb) / 8); + ext4_group_desc_csum_set(sb, block_group, gdp); + ext4_unlock_group(sb, block_group); + + percpu_counter_inc(&sbi->s_freeinodes_counter); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, block_group); + + atomic_inc(&sbi->s_flex_groups[f].free_inodes); + if (is_directory) + atomic_dec(&sbi->s_flex_groups[f].used_dirs); + } + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); + fatal = ext4_handle_dirty_metadata(handle, NULL, bh2); +out: + if (cleared) { + BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (!fatal) + fatal = err; + } else { + ext4_error(sb, "bit already cleared for inode %lu", ino); + if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); } - BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh2); - if (!fatal) fatal = err; - } - BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bitmap_bh); - if (!fatal) - fatal = err; - sb->s_dirt = 1; + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + } + error_return: brelse(bitmap_bh); ext4_std_error(sb, fatal); } +struct orlov_stats { + __u64 free_clusters; + __u32 free_inodes; + __u32 used_dirs; +}; + /* - * There are two policies for allocating an inode. If the new inode is - * a directory, then a forward search is made for a block group with both - * free space and a low directory-to-inode ratio; if that fails, then of - * the groups with above-average free space, that group with the fewest - * directories already is chosen. - * - * For other inodes, search forward from the parent directory\'s block - * group to find a free inode. + * Helper function for Orlov's allocator; returns critical information + * for a particular block group or flex_bg. If flex_size is 1, then g + * is a block group number; otherwise it is flex_bg number. 
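+ * For example, with s_log_groups_per_flex = 4 (flex_size = 16), + * g = 3 aggregates the counters of block groups 48..63.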
*/ -static int find_group_dir(struct super_block *sb, struct inode *parent, - ext4_group_t *best_group) +static void get_orlov_stats(struct super_block *sb, ext4_group_t g, + int flex_size, struct orlov_stats *stats) { - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; - unsigned int freei, avefreei; - struct ext4_group_desc *desc, *best_desc = NULL; - ext4_group_t group; - int ret = -1; + struct ext4_group_desc *desc; + struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups; - freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter); - avefreei = freei / ngroups; + if (flex_size > 1) { + stats->free_inodes = atomic_read(&flex_group[g].free_inodes); + stats->free_clusters = atomic64_read(&flex_group[g].free_clusters); + stats->used_dirs = atomic_read(&flex_group[g].used_dirs); + return; + } - for (group = 0; group < ngroups; group++) { - desc = ext4_get_group_desc (sb, group, NULL); - if (!desc || !desc->bg_free_inodes_count) - continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) - continue; - if (!best_desc || - (le16_to_cpu(desc->bg_free_blocks_count) > - le16_to_cpu(best_desc->bg_free_blocks_count))) { - *best_group = group; - best_desc = desc; - ret = 0; - } + desc = ext4_get_group_desc(sb, g, NULL); + if (desc) { + stats->free_inodes = ext4_free_inodes_count(sb, desc); + stats->free_clusters = ext4_free_group_clusters(sb, desc); + stats->used_dirs = ext4_used_dirs_count(sb, desc); + } else { + stats->free_inodes = 0; + stats->free_clusters = 0; + stats->used_dirs = 0; } - return ret; } /* @@ -304,108 +403,146 @@ static int find_group_dir(struct super_block *sb, struct inode *parent, * it has too many directories already (max_dirs) or * it has too few free inodes left (min_inodes) or * it has too few free blocks left (min_blocks) or - * it's already running too large debt (max_debt). - * Parent's group is prefered, if it doesn't satisfy these + * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. If none * of the groups look good we just look for a group with more * free inodes than average (starting at parent's group). - * - * Debt is incremented each time we allocate a directory and decremented - * when we allocate an inode, within 0--255. 
*/ -#define INODE_COST 64 -#define BLOCK_COST 256 - static int find_group_orlov(struct super_block *sb, struct inode *parent, - ext4_group_t *group) + ext4_group_t *group, umode_t mode, + const struct qstr *qstr) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - ext4_group_t ngroups = sbi->s_groups_count; + ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); - unsigned int freei, avefreei; - ext4_fsblk_t freeb, avefreeb; - ext4_fsblk_t blocks_per_dir; + unsigned int freei, avefreei, grp_free; + ext4_fsblk_t freeb, avefreec; unsigned int ndirs; - int max_debt, max_dirs, min_inodes; - ext4_grpblk_t min_blocks; - ext4_group_t i; + int max_dirs, min_inodes; + ext4_grpblk_t min_clusters; + ext4_group_t i, grp, g, ngroups; struct ext4_group_desc *desc; + struct orlov_stats stats; + int flex_size = ext4_flex_bg_size(sbi); + struct dx_hash_info hinfo; + + ngroups = real_ngroups; + if (flex_size > 1) { + ngroups = (real_ngroups + flex_size - 1) >> + sbi->s_log_groups_per_flex; + parent_group >>= sbi->s_log_groups_per_flex; + } freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); avefreei = freei / ngroups; - freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); - avefreeb = freeb; - do_div(avefreeb, ngroups); + freeb = EXT4_C2B(sbi, + percpu_counter_read_positive(&sbi->s_freeclusters_counter)); + avefreec = freeb; + do_div(avefreec, ngroups); ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); - if ((parent == sb->s_root->d_inode) || - (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) { + if (S_ISDIR(mode) && + ((parent == sb->s_root->d_inode) || + (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) { int best_ndir = inodes_per_group; - ext4_group_t grp; int ret = -1; - get_random_bytes(&grp, sizeof(grp)); + if (qstr) { + hinfo.hash_version = DX_HASH_HALF_MD4; + hinfo.seed = sbi->s_hash_seed; + ext4fs_dirhash(qstr->name, qstr->len, &hinfo); + grp = hinfo.hash; + } else + grp = prandom_u32(); parent_group = (unsigned)grp % ngroups; for (i = 0; i < ngroups; i++) { - grp = (parent_group + i) % ngroups; - desc = ext4_get_group_desc(sb, grp, NULL); - if (!desc || !desc->bg_free_inodes_count) + g = (parent_group + i) % ngroups; + get_orlov_stats(sb, g, flex_size, &stats); + if (!stats.free_inodes) continue; - if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) + if (stats.used_dirs >= best_ndir) continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + if (stats.free_inodes < avefreei) continue; - if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) + if (stats.free_clusters < avefreec) continue; - *group = grp; + grp = g; ret = 0; - best_ndir = le16_to_cpu(desc->bg_used_dirs_count); + best_ndir = stats.used_dirs; + } + if (ret) + goto fallback; + found_flex_bg: + if (flex_size == 1) { + *group = grp; + return 0; + } + + /* + * We pack inodes at the beginning of the flexgroup's + * inode tables. Block allocation decisions will do + * something similar, although regular files will + * start at 2nd block group of the flexgroup. See + * ext4_ext_find_goal() and ext4_find_near(). 
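+ * For example, with flex_size = 16 a chosen flex group of 2 makes + * the loop below scan block groups 32..47 and take the first one + * that still has a free inode.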
+ */ + grp *= flex_size; + for (i = 0; i < flex_size; i++) { + if (grp+i >= real_ngroups) + break; + desc = ext4_get_group_desc(sb, grp+i, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = grp+i; + return 0; + } + } - if (ret == 0) - return ret; goto fallback; } - blocks_per_dir = ext4_blocks_count(es) - freeb; - do_div(blocks_per_dir, ndirs); - max_dirs = ndirs / ngroups + inodes_per_group / 16; - min_inodes = avefreei - inodes_per_group / 4; - min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4; - - max_debt = EXT4_BLOCKS_PER_GROUP(sb); - max_debt /= max_t(int, blocks_per_dir, BLOCK_COST); - if (max_debt * INODE_COST > inodes_per_group) - max_debt = inodes_per_group / INODE_COST; - if (max_debt > 255) - max_debt = 255; - if (max_debt == 0) - max_debt = 1; + max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16; + min_inodes = avefreei - inodes_per_group*flex_size / 4; + if (min_inodes < 1) + min_inodes = 1; + min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4; + + /* + * Start looking in the flex group where we last allocated an + * inode for this parent directory + */ + if (EXT4_I(parent)->i_last_alloc_group != ~0) { + parent_group = EXT4_I(parent)->i_last_alloc_group; + if (flex_size > 1) + parent_group >>= sbi->s_log_groups_per_flex; + } for (i = 0; i < ngroups; i++) { - *group = (parent_group + i) % ngroups; - desc = ext4_get_group_desc(sb, *group, NULL); - if (!desc || !desc->bg_free_inodes_count) + grp = (parent_group + i) % ngroups; + get_orlov_stats(sb, grp, flex_size, &stats); + if (stats.used_dirs >= max_dirs) continue; - if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) + if (stats.free_inodes < min_inodes) continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) + if (stats.free_clusters < min_clusters) continue; - if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) - continue; - return 0; + goto found_flex_bg; } fallback: + ngroups = real_ngroups; + avefreei = freei / ngroups; +fallback_retry: + parent_group = EXT4_I(parent)->i_block_group; for (i = 0; i < ngroups; i++) { - *group = (parent_group + i) % ngroups; - desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && desc->bg_free_inodes_count && - le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) - return 0; + grp = (parent_group + i) % ngroups; + desc = ext4_get_group_desc(sb, grp, NULL); + if (desc) { + grp_free = ext4_free_inodes_count(sb, desc); + if (grp_free && grp_free >= avefreei) { + *group = grp; + return 0; + } + } } if (avefreei) { @@ -414,27 +551,65 @@ fallback: * filesystems the above test can fail to find any blockgroups */ avefreei = 0; - goto fallback; + goto fallback_retry; } return -1; } static int find_group_other(struct super_block *sb, struct inode *parent, - ext4_group_t *group) + ext4_group_t *group, umode_t mode) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; - ext4_group_t i; + int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); + + /* + * Try to place the inode in the same flex group as its + * parent. If we can't find space, use the Orlov algorithm to + * find another flex group, and store that information in the + * parent directory's inode information so that we use that + * flex group for future allocations.
+ */ + if (flex_size > 1) { + int retry = 0; + + try_again: + parent_group &= ~(flex_size-1); + last = parent_group + flex_size; + if (last > ngroups) + last = ngroups; + for (i = parent_group; i < last; i++) { + desc = ext4_get_group_desc(sb, i, NULL); + if (desc && ext4_free_inodes_count(sb, desc)) { + *group = i; + return 0; + } + } + if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) { + retry = 1; + parent_group = EXT4_I(parent)->i_last_alloc_group; + goto try_again; + } + /* + * If this didn't work, use the Orlov search algorithm + * to find a new flex group; we pass in the mode to + * avoid the topdir algorithms. + */ + *group = parent_group + flex_size; + if (*group > ngroups) + *group = 0; + return find_group_orlov(sb, parent, group, mode, NULL); + } /* * Try to place the inode in its parent directory */ *group = parent_group; desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && le16_to_cpu(desc->bg_free_inodes_count) && - le16_to_cpu(desc->bg_free_blocks_count)) + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_group_clusters(sb, desc)) return 0; /* @@ -457,8 +632,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, if (*group >= ngroups) *group -= ngroups; desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && le16_to_cpu(desc->bg_free_inodes_count) && - le16_to_cpu(desc->bg_free_blocks_count)) + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_group_clusters(sb, desc)) return 0; } @@ -471,7 +646,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, if (++*group >= ngroups) *group = 0; desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + if (desc && ext4_free_inodes_count(sb, desc)) return 0; } @@ -479,6 +654,51 @@ static int find_group_other(struct super_block *sb, struct inode *parent, } /* + * In no journal mode, if an inode has recently been deleted, we want + * to avoid reusing it until we're reasonably sure the inode table + * block has been written back to disk. (Yes, these values are + * somewhat arbitrary...) + */ +#define RECENTCY_MIN 5 +#define RECENTCY_DIRTY 30 + +static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) +{ + struct ext4_group_desc *gdp; + struct ext4_inode *raw_inode; + struct buffer_head *bh; + unsigned long dtime, now; + int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int offset, ret = 0, recentcy = RECENTCY_MIN; + + gdp = ext4_get_group_desc(sb, group, NULL); + if (unlikely(!gdp)) + return 0; + + bh = sb_getblk(sb, ext4_inode_table(sb, gdp) + + (ino / inodes_per_block)); + if (unlikely(!bh) || !buffer_uptodate(bh)) + /* + * If the block is not in the buffer cache, then it + * must have been written out. + */ + goto out; + + offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb); + raw_inode = (struct ext4_inode *) (bh->b_data + offset); + dtime = le32_to_cpu(raw_inode->i_dtime); + now = get_seconds(); + if (buffer_dirty(bh)) + recentcy += RECENTCY_DIRTY; + + if (dtime && (dtime < now) && (now < dtime + recentcy)) + ret = 1; +out: + brelse(bh); + return ret; +} + +/* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both * free space and a low directory-to-inode ratio; if that fails, then of @@ -488,209 +708,272 @@ static int find_group_other(struct super_block *sb, struct inode *parent, * For other inodes, search forward from the parent directory's block * group to find a free inode. 
 */ -struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) +struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, + umode_t mode, const struct qstr *qstr, + __u32 goal, uid_t *owner, int handle_type, + unsigned int line_no, int nblocks) { struct super_block *sb; - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *bh2; - ext4_group_t group = 0; + struct buffer_head *inode_bitmap_bh = NULL; + struct buffer_head *group_desc_bh; + ext4_group_t ngroups, group = 0; unsigned long ino = 0; - struct inode * inode; - struct ext4_group_desc * gdp = NULL; - struct ext4_super_block * es; + struct inode *inode; + struct ext4_group_desc *gdp = NULL; struct ext4_inode_info *ei; struct ext4_sb_info *sbi; int ret2, err = 0; struct inode *ret; ext4_group_t i; - int free = 0; + ext4_group_t flex_group; + struct ext4_group_info *grp; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) return ERR_PTR(-EPERM); sb = dir->i_sb; + ngroups = ext4_get_groups_count(sb); + trace_ext4_request_inode(dir, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); ei = EXT4_I(inode); sbi = EXT4_SB(sb); - es = sbi->s_es; - if (S_ISDIR(mode)) { - if (test_opt (sb, OLDALLOC)) - ret2 = find_group_dir(sb, dir, &group); - else - ret2 = find_group_orlov(sb, dir, &group); + + /* + * Initialize owners and quota early so that we don't have to account + * for quota initialization worst case in the standard inode creating + * transaction + */ + if (owner) { + inode->i_mode = mode; + i_uid_write(inode, owner[0]); + i_gid_write(inode, owner[1]); + } else if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = dir->i_gid; } else - ret2 = find_group_other(sb, dir, &group); + inode_init_owner(inode, dir, mode); + dquot_initialize(inode); + + if (!goal) + goal = sbi->s_inode_goal; + + if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) { + group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); + ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); + ret2 = 0; + goto got_group; + } + if (S_ISDIR(mode)) + ret2 = find_group_orlov(sb, dir, &group, mode, qstr); + else + ret2 = find_group_other(sb, dir, &group, mode); + +got_group: + EXT4_I(dir)->i_last_alloc_group = group; err = -ENOSPC; if (ret2 == -1) goto out; - for (i = 0; i < sbi->s_groups_count; i++) { + /* + * Normally we will only go through one pass of this loop, + * unless we get unlucky and it turns out the group we selected + * had its last inode grabbed by someone else. + */ + for (i = 0; i < ngroups; i++, ino = 0) { err = -EIO; - gdp = ext4_get_group_desc(sb, group, &bh2); + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp) - goto fail; + goto out; - brelse(bitmap_bh); - bitmap_bh = read_inode_bitmap(sb, group); - if (!bitmap_bh) - goto fail; + /* + * Check free inodes count before loading bitmap.
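+ * A free count of zero lets us skip reading the (possibly uncached) + * bitmap block entirely and move straight on to the next group.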
+ */ + if (ext4_free_inodes_count(sb, gdp) == 0) { + if (++group == ngroups) + group = 0; + continue; + } + + grp = ext4_get_group_info(sb, group); + /* Skip groups with already-known suspicious inode tables */ + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + if (++group == ngroups) + group = 0; + continue; + } - ino = 0; + brelse(inode_bitmap_bh); + inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); + /* Skip groups with suspicious inode tables */ + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) { + if (++group == ngroups) + group = 0; + continue; + } repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) - bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); - if (ino < EXT4_INODES_PER_GROUP(sb)) { - - BUFFER_TRACE(bitmap_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bitmap_bh); - if (err) - goto fail; - - if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group), - ino, bitmap_bh->b_data)) { - /* we won it */ - BUFFER_TRACE(bitmap_bh, - "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, - bitmap_bh); - if (err) - goto fail; - goto got; + inode_bitmap_bh->b_data, + EXT4_INODES_PER_GROUP(sb), ino); + if (ino >= EXT4_INODES_PER_GROUP(sb)) + goto next_group; + if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { + ext4_error(sb, "reserved inode found cleared - " + "inode=%lu", ino + 1); + continue; + } + if ((EXT4_SB(sb)->s_journal == NULL) && + recently_deleted(sb, group, ino)) { + ino++; + goto next_inode; + } + if (!handle) { + BUG_ON(nblocks <= 0); + handle = __ext4_journal_start_sb(dir->i_sb, line_no, + handle_type, nblocks, + 0); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext4_std_error(sb, err); + goto out; } - /* we lost it */ - jbd2_journal_release_buffer(handle, bitmap_bh); - - if (++ino < EXT4_INODES_PER_GROUP(sb)) - goto repeat_in_this_group; } - - /* - * This case is possible in concurrent environment. It is very - * rare. We cannot repeat the find_group_xxx() call because - * that will simply return the same blockgroup, because the - * group descriptor metadata has not yet been updated. - * So we just go onto the next blockgroup. - */ - if (++group == sbi->s_groups_count) + BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode_bitmap_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } + ext4_lock_group(sb, group); + ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); + ext4_unlock_group(sb, group); + ino++; /* the inode bitmap is zero-based */ + if (!ret2) + goto got; /* we grabbed the inode! 
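 *
 * [Editorial note, not part of the patch] The claim is a plain
 * test-and-set under the group spinlock. A worked race, with tasks A
 * and B both finding bit 3615 clear:
 *
 *	A: lock group, test_and_set(3615) -> was 0, unlock: A wins
 *	B: lock group, test_and_set(3615) -> was 1, unlock: B loses,
 *	   advances ino and retries from repeat_in_this_group
 *
 * Unlike the removed code, the loser has nothing to undo: the bitmap
 * buffer is only dirtied after a successful claim, so no
 * jbd2_journal_release_buffer() dance is needed.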
*/ +next_inode: + if (ino < EXT4_INODES_PER_GROUP(sb)) + goto repeat_in_this_group; +next_group: + if (++group == ngroups) group = 0; } err = -ENOSPC; goto out; got: - ino++; - if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || - ino > EXT4_INODES_PER_GROUP(sb)) { - ext4_error(sb, __FUNCTION__, - "reserved inode or inode > inodes count - " - "block_group = %lu, inode=%lu", group, - ino + group * EXT4_INODES_PER_GROUP(sb)); - err = -EIO; - goto fail; + BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); + if (err) { + ext4_std_error(sb, err); + goto out; } - BUFFER_TRACE(bh2, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh2); - if (err) goto fail; + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, group_desc_bh); + if (err) { + ext4_std_error(sb, err); + goto out; + } /* We may have to initialize the block bitmap if it isn't already */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && + if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - struct buffer_head *block_bh = read_block_bitmap(sb, group); + struct buffer_head *block_bitmap_bh; - BUFFER_TRACE(block_bh, "get block bitmap access"); - err = ext4_journal_get_write_access(handle, block_bh); + block_bitmap_bh = ext4_read_block_bitmap(sb, group); + BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); + err = ext4_journal_get_write_access(handle, block_bitmap_bh); if (err) { - brelse(block_bh); - goto fail; + brelse(block_bitmap_bh); + ext4_std_error(sb, err); + goto out; } - free = 0; - spin_lock(sb_bgl_lock(sbi, group)); + BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); + err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); + /* recheck and clear flag under lock if we still need to */ + ext4_lock_group(sb, group); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - free = ext4_free_blocks_after_init(sb, group, gdp); - gdp->bg_free_blocks_count = cpu_to_le16(free); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, group, gdp)); + ext4_block_bitmap_csum_set(sb, group, gdp, + block_bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); } - spin_unlock(sb_bgl_lock(sbi, group)); + ext4_unlock_group(sb, group); + brelse(block_bitmap_bh); - /* Don't need to dirty bitmap block if we didn't change it */ - if (free) { - BUFFER_TRACE(block_bh, "dirty block bitmap"); - err = ext4_journal_dirty_metadata(handle, block_bh); + if (err) { + ext4_std_error(sb, err); + goto out; } - - brelse(block_bh); - if (err) - goto fail; } - spin_lock(sb_bgl_lock(sbi, group)); - /* If we didn't allocate from within the initialized part of the inode - * table then we need to initialize up to this inode. 
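 *
 * [Editorial sketch, not part of the patch] Worked example for the
 * bg_itable_unused update below: with 8192 inodes per group and
 * bg_itable_unused = 8000, the initialized part of the table covers
 * free = 8192 - 8000 = 192 inodes. Allocating relative inode 200
 * (> 192) extends that region, so bg_itable_unused is rewritten as
 * 8192 - 200 = 7992.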
*/ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + /* Update the relevant bg descriptor fields */ + if (ext4_has_group_desc_csum(sb)) { + int free; + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + + down_read(&grp->alloc_sem); /* protect vs itable lazyinit */ + ext4_lock_group(sb, group); /* while we modify the bg desc */ + free = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); - - /* When marking the block group with - * ~EXT4_BG_INODE_UNINIT we don't want to depend - * on the value of bg_itable_unsed even though - * mke2fs could have initialized the same for us. - * Instead we calculated the value below - */ - free = 0; - } else { - free = EXT4_INODES_PER_GROUP(sb) - - le16_to_cpu(gdp->bg_itable_unused); } - /* * Check the relative inode number against the last used * relative inode number in this group. if it is greater - * we need to update the bg_itable_unused count - * + * we need to update the bg_itable_unused count */ if (ino > free) - gdp->bg_itable_unused = - cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino); + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - ino)); + up_read(&grp->alloc_sem); + } else { + ext4_lock_group(sb, group); } - gdp->bg_free_inodes_count = - cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1); + ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); if (S_ISDIR(mode)) { - gdp->bg_used_dirs_count = - cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1); + ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, group); + + atomic_inc(&sbi->s_flex_groups[f].used_dirs); + } + } + if (ext4_has_group_desc_csum(sb)) { + ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh, + EXT4_INODES_PER_GROUP(sb) / 8); + ext4_group_desc_csum_set(sb, group, gdp); + } + ext4_unlock_group(sb, group); + + BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); + if (err) { + ext4_std_error(sb, err); + goto out; } - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); - spin_unlock(sb_bgl_lock(sbi, group)); - BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh2); - if (err) goto fail; percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) percpu_counter_inc(&sbi->s_dirs_counter); - sb->s_dirt = 1; - inode->i_uid = current->fsuid; - if (test_opt (sb, GRPID)) - inode->i_gid = dir->i_gid; - else if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current->fsgid; - inode->i_mode = mode; + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, group); + atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); + } inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ @@ -702,85 +985,99 @@ got: ei->i_dir_start_lookup = 0; ei->i_disksize = 0; - /* - * Don't inherit extent flag from directory. 
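 *
 * [Editorial sketch, not part of the patch] The replacement one-liner
 * below, ei->i_flags = ext4_mask_flags(mode, ... & EXT4_FL_INHERITED),
 * folds this removed open-coded logic into two masks; roughly (the
 * exact flag sets live in ext4.h and are an assumption here):
 *
 *	if (S_ISDIR(mode))
 *		keep all inherited flags;
 *	else if (S_ISREG(mode))
 *		flags &= EXT4_REG_FLMASK;	drops dir-only bits
 *					such as DIRSYNC and TOPDIR
 *	else
 *		flags &= EXT4_OTHER_FLMASK;	symlinks/specials keep
 *					only NODUMP and NOATIME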
We set extent flag on - * newly created directory and file only if -o extent mount option is - * specified - */ - ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL); - if (S_ISLNK(mode)) - ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL); - /* dirsync only applies to directories */ - if (!S_ISDIR(mode)) - ei->i_flags &= ~EXT4_DIRSYNC_FL; + /* Don't inherit extent flag from directory, amongst others. */ + ei->i_flags = + ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); ei->i_file_acl = 0; ei->i_dtime = 0; - ei->i_block_alloc_info = NULL; ei->i_block_group = group; + ei->i_last_alloc_group = ~0; ext4_set_inode_flags(inode); if (IS_DIRSYNC(inode)) - handle->h_sync = 1; - insert_inode_hash(inode); + ext4_handle_sync(handle); + if (insert_inode_locked(inode) < 0) { + /* + * Likely a bitmap corruption causing inode to be allocated + * twice. + */ + err = -EIO; + ext4_error(sb, "failed to insert inode %lu: doubly allocated?", + inode->i_ino); + goto out; + } spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; spin_unlock(&sbi->s_next_gen_lock); - ei->i_state = EXT4_STATE_NEW; + /* Precompute checksum seed for inode metadata */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + __u32 csum; + __le32 inum = cpu_to_le32(inode->i_ino); + __le32 gen = cpu_to_le32(inode->i_generation); + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + sizeof(inum)); + ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, + sizeof(gen)); + } + + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ + ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; + ei->i_inline_off = 0; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + ret = inode; - if(DQUOT_ALLOC_INODE(inode)) { - err = -EDQUOT; + err = dquot_alloc_inode(inode); + if (err) goto fail_drop; - } err = ext4_init_acl(handle, inode, dir); if (err) goto fail_free_drop; - err = ext4_init_security(handle,inode, dir); + err = ext4_init_security(handle, inode, dir, qstr); if (err) goto fail_free_drop; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { + /* set extent flag only for directory, file and normal symlink*/ + if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode); + } + } + + if (ext4_handle_valid(handle)) { + ei->i_sync_tid = handle->h_transaction->t_tid; + ei->i_datasync_tid = handle->h_transaction->t_tid; + } + err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_std_error(sb, err); goto fail_free_drop; } - if (test_opt(sb, EXTENTS)) { - /* set extent flag only for directory and file */ - if (S_ISDIR(mode) || S_ISREG(mode)) { - EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; - ext4_ext_tree_init(handle, inode); - err = ext4_update_incompat_feature(handle, sb, - EXT4_FEATURE_INCOMPAT_EXTENTS); - if (err) - goto fail; - } - } ext4_debug("allocating inode %lu\n", inode->i_ino); - goto really_out; -fail: - ext4_std_error(sb, err); -out: - iput(inode); - ret = ERR_PTR(err); -really_out: - brelse(bitmap_bh); + trace_ext4_allocate_inode(inode, dir, mode); + brelse(inode_bitmap_bh); return ret; fail_free_drop: - DQUOT_FREE_INODE(inode); - + dquot_free_inode(inode); fail_drop: - DQUOT_DROP(inode); + clear_nlink(inode); + unlock_new_inode(inode); +out: + dquot_drop(inode); inode->i_flags |= S_NOQUOTA; - inode->i_nlink = 0; 
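/*
 * [Editorial sketch, not part of the patch] The metadata_csum seed
 * computed above chains crc32c, so the per-inode seed changes whenever
 * an inode number is reused with a new generation:
 *
 *	seed = crc32c(crc32c(s_csum_seed, le32 i_ino), le32 i_generation)
 */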
iput(inode); - brelse(bitmap_bh); + brelse(inode_bitmap_bh); return ERR_PTR(err); } @@ -796,17 +1093,15 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) /* Error cases - e2fsck has already cleaned up for us */ if (ino > max_ino) { - ext4_warning(sb, __FUNCTION__, - "bad orphan ino %lu! e2fsck was run?", ino); + ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino); goto error; } block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); - bitmap_bh = read_inode_bitmap(sb, block_group); + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); if (!bitmap_bh) { - ext4_warning(sb, __FUNCTION__, - "inode bitmap error for orphan %lu", ino); + ext4_warning(sb, "inode bitmap error for orphan %lu", ino); goto error; } @@ -821,6 +1116,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) if (IS_ERR(inode)) goto iget_failed; + /* + * If the orphans has i_nlinks > 0 then it should be able to be + * truncated, otherwise it won't be removed from the orphan list + * during processing and an infinite loop will result. + */ + if (inode->i_nlink && !ext4_can_truncate(inode)) + goto bad_orphan; + if (NEXT_ORPHAN(inode) > max_ino) goto bad_orphan; brelse(bitmap_bh); @@ -830,18 +1133,18 @@ iget_failed: err = PTR_ERR(inode); inode = NULL; bad_orphan: - ext4_warning(sb, __FUNCTION__, - "bad orphan inode %lu! e2fsck was run?", ino); - printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", + ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino); + printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, ext4_test_bit(bit, bitmap_bh->b_data)); - printk(KERN_NOTICE "inode=%p\n", inode); + printk(KERN_WARNING "inode=%p\n", inode); if (inode) { - printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", + printk(KERN_WARNING "is_bad_inode(inode)=%d\n", is_bad_inode(inode)); - printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", + printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); - printk(KERN_NOTICE "max_ino=%lu\n", max_ino); + printk(KERN_WARNING "max_ino=%lu\n", max_ino); + printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; @@ -852,11 +1155,11 @@ error: return ERR_PTR(err); } -unsigned long ext4_count_free_inodes (struct super_block * sb) +unsigned long ext4_count_free_inodes(struct super_block *sb) { unsigned long desc_count; struct ext4_group_desc *gdp; - ext4_group_t i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); #ifdef EXT4FS_DEBUG struct ext4_super_block *es; unsigned long bitmap_count, x; @@ -866,32 +1169,34 @@ unsigned long ext4_count_free_inodes (struct super_block * sb) desc_count = 0; bitmap_count = 0; gdp = NULL; - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { - gdp = ext4_get_group_desc (sb, i, NULL); + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + desc_count += ext4_free_inodes_count(sb, gdp); brelse(bitmap_bh); - bitmap_bh = read_inode_bitmap(sb, i); + bitmap_bh = ext4_read_inode_bitmap(sb, i); if (!bitmap_bh) continue; - x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); + x = ext4_count_free(bitmap_bh->b_data, + EXT4_INODES_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", - i, le16_to_cpu(gdp->bg_free_inodes_count), x); + (unsigned long) i, ext4_free_inodes_count(sb, 
gdp), x); bitmap_count += x; } brelse(bitmap_bh); - printk("ext4_count_free_inodes: stored = %u, computed = %lu, %lu\n", - le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); + printk(KERN_DEBUG "ext4_count_free_inodes: " + "stored = %u, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); return desc_count; #else desc_count = 0; - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { - gdp = ext4_get_group_desc (sb, i, NULL); + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + desc_count += ext4_free_inodes_count(sb, gdp); cond_resched(); } return desc_count; @@ -899,17 +1204,122 @@ unsigned long ext4_count_free_inodes (struct super_block * sb) } /* Called at mount-time, super-block is locked */ -unsigned long ext4_count_dirs (struct super_block * sb) +unsigned long ext4_count_dirs(struct super_block * sb) { unsigned long count = 0; - ext4_group_t i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { - struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL); + for (i = 0; i < ngroups; i++) { + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - count += le16_to_cpu(gdp->bg_used_dirs_count); + count += ext4_used_dirs_count(sb, gdp); } return count; } +/* + * Zeroes not yet zeroed inode table - just write zeroes through the whole + * inode table. Must be called without any spinlock held. The only place + * where it is called from on active part of filesystem is ext4lazyinit + * thread, so we do not need any special locks, however we have to prevent + * inode allocation from the current group, so we take alloc_sem lock, to + * block ext4_new_inode() until we are finished. + */ +int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, + int barrier) +{ + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + struct buffer_head *group_desc_bh; + handle_t *handle; + ext4_fsblk_t blk; + int num, ret = 0, used_blks = 0; + + /* This should not happen, but just to be sure check this */ + if (sb->s_flags & MS_RDONLY) { + ret = 1; + goto out; + } + + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); + if (!gdp) + goto out; + + /* + * We do not need to lock this, because we are the only one + * handling this flag. + */ + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) + goto out; + + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + down_write(&grp->alloc_sem); + /* + * If inode bitmap was already initialized there may be some + * used inodes so we need to skip blocks with used inodes in + * inode table. 
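 *
 * [Editorial sketch, not part of the patch] Worked example for the
 * computation below: with 4096-byte blocks and 256-byte inodes,
 * s_inodes_per_block = 16. For a group of 8192 inodes with
 * bg_itable_unused = 8000, 192 inodes are in use, so
 * used_blks = DIV_ROUND_UP(192, 16) = 12 and the zeroout covers the
 * remaining s_itb_per_group - 12 = 512 - 12 = 500 table blocks.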
+ */ + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) + used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp)), + sbi->s_inodes_per_block); + + if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) { + ext4_error(sb, "Something is wrong with group %u: " + "used itable blocks: %d; " + "itable unused count: %u", + group, used_blks, + ext4_itable_unused_count(sb, gdp)); + ret = 1; + goto err_out; + } + + blk = ext4_inode_table(sb, gdp) + used_blks; + num = sbi->s_itb_per_group - used_blks; + + BUFFER_TRACE(group_desc_bh, "get_write_access"); + ret = ext4_journal_get_write_access(handle, + group_desc_bh); + if (ret) + goto err_out; + + /* + * Skip zeroout if the inode table is full. But we set the ZEROED + * flag anyway, because obviously, when it is full it does not need + * further zeroing. + */ + if (unlikely(num == 0)) + goto skip_zeroout; + + ext4_debug("going to zero out inode table in group %d\n", + group); + ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS); + if (ret < 0) + goto err_out; + if (barrier) + blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL); + +skip_zeroout: + ext4_lock_group(sb, group); + gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); + ext4_group_desc_csum_set(sb, group, gdp); + ext4_unlock_group(sb, group); + + BUFFER_TRACE(group_desc_bh, + "call ext4_handle_dirty_metadata"); + ret = ext4_handle_dirty_metadata(handle, NULL, + group_desc_bh); + +err_out: + up_write(&grp->alloc_sem); + ext4_journal_stop(handle); +out: + return ret; +} diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c new file mode 100644 index 00000000000..fd69da19482 --- /dev/null +++ b/fs/ext4/indirect.c @@ -0,0 +1,1391 @@ +/* + * linux/fs/ext4/indirect.c + * + * from + * + * linux/fs/ext4/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@redhat.com), 1993, 1998 + */ + +#include <linux/aio.h> +#include "ext4_jbd2.h" +#include "truncate.h" + +#include <trace/events/ext4.h> + +typedef struct { + __le32 *p; + __le32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +/** + * ext4_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * @boundary: set this non-zero if the referred-to block is likely to be + * followed (on disk) by an indirect block. + * + * To store the locations of file's data ext4 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). 
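 *
 * [Editorial sketch, not part of the patch] Worked example with 4 KiB
 * blocks (ptrs = 1024, ptrs_bits = 10) and i_block = 70000:
 * 70000 - 12 = 69988 >= 1024, and 69988 - 1024 = 68964 < 1024 * 1024,
 * so the block is double-indirect:
 *
 *	offsets[] = { EXT4_DIND_BLOCK, 68964 >> 10, 68964 & 1023 }
 *	          = { 13, 67, 356 },  returned depth = 3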
+ */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. + */ + +static int ext4_block_to_path(struct inode *inode, + ext4_lblk_t i_block, + ext4_lblk_t offsets[4], int *boundary) +{ + int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT4_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + int final = 0; + + if (i_block < direct_blocks) { + offsets[n++] = i_block; + final = direct_blocks; + } else if ((i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT4_IND_BLOCK; + offsets[n++] = i_block; + final = ptrs; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT4_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT4_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + final = ptrs; + } else { + ext4_warning(inode->i_sb, "block %lu > max in inode %lu", + i_block + direct_blocks + + indirect_blocks + double_blocks, inode->i_ino); + } + if (boundary) + *boundary = final - 1 - (i_block & (ptrs - 1)); + return n; +} + +/** + * ext4_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples <key, p, bh> and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). 
+ * + * Need to be called with + * down_read(&EXT4_I(inode)->i_data_sem) + */ +static Indirect *ext4_get_branch(struct inode *inode, int depth, + ext4_lblk_t *offsets, + Indirect chain[4], int *err) +{ + struct super_block *sb = inode->i_sb; + Indirect *p = chain; + struct buffer_head *bh; + int ret = -EIO; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = sb_getblk(sb, le32_to_cpu(p->key)); + if (unlikely(!bh)) { + ret = -ENOMEM; + goto failure; + } + + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + put_bh(bh); + goto failure; + } + /* validate block references */ + if (ext4_check_indirect_blockref(inode, bh)) { + put_bh(bh); + goto failure; + } + } + + add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + +failure: + *err = ret; +no_block: + return p; +} + +/** + * ext4_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. + * + * This function returns the preferred place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same + * cylinder group. + * + * In the latter case we colour the starting block by the callers PID to + * prevent it from clashing with concurrent allocations for a different inode + * in the same block group. The PID is used here so that functionally related + * files will be close-by on-disk. + * + * Caller must make sure that @ind is valid and will stay that way. + */ +static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; + __le32 *p; + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) { + if (*p) + return le32_to_cpu(*p); + } + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be referred to from the inode itself? OK, just put it + * into the same cylinder group then. + */ + return ext4_inode_to_goal_block(inode); +} + +/** + * ext4_find_goal - find a preferred place for allocation. + * @inode: owner + * @block: block we want + * @partial: pointer to the last triple within a chain + * + * Normally this function find the preferred place for block allocation, + * returns it. + * Because this is only used for non-extent files, we limit the block nr + * to 32 bits. + */ +static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, + Indirect *partial) +{ + ext4_fsblk_t goal; + + /* + * XXX need to get goal block from mballoc's data structures + */ + + goal = ext4_find_near(inode, partial); + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + return goal; +} + +/** + * ext4_blks_to_allocate - Look up the block map and count the number + * of direct blocks need to be allocated for the given branch. + * + * @branch: chain of indirect blocks + * @k: number of blocks need for indirect blocks + * @blks: number of data blocks to be mapped. + * @blocks_to_boundary: the offset in the indirect block + * + * return the total number of blocks to be allocate, including the + * direct and indirect blocks. 
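 *
 * [Editorial sketch, not part of the patch] Two worked cases for the
 * function below: (a) metadata still missing (k > 0), blks = 10,
 * blocks_to_boundary = 3: it returns 3 + 1 = 4, because one pass never
 * crosses an indirect-block boundary; (b) k == 0: it counts the target
 * slot plus the zero slots that follow it, again capped by blks and
 * the boundary.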
+ */ +static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, + int blocks_to_boundary) +{ + unsigned int count = 0; + + /* + * Simple case, [t,d]Indirect block(s) has not allocated yet + * then it's clear blocks on that path have not allocated + */ + if (k > 0) { + /* right now we don't handle cross boundary allocation */ + if (blks < blocks_to_boundary + 1) + count += blks; + else + count += blocks_to_boundary + 1; + return count; + } + + count++; + while (count < blks && count <= blocks_to_boundary && + le32_to_cpu(*(branch[0].p + count)) == 0) { + count++; + } + return count; +} + +/** + * ext4_alloc_branch - allocate and set up a chain of blocks. + * @handle: handle for this transaction + * @inode: owner + * @indirect_blks: number of allocated indirect blocks + * @blks: number of allocated direct blocks + * @goal: preferred place for allocation + * @offsets: offsets (in the blocks) to store the pointers to next. + * @branch: place to store the chain in. + * + * This function allocates blocks, zeroes out all but the last one, + * links them into chain and (if we are synchronous) writes them to disk. + * In other words, it prepares a branch that can be spliced onto the + * inode. It stores the information about that chain in the branch[], in + * the same format as ext4_get_branch() would do. We are calling it after + * we had read the existing part of chain and partial points to the last + * triple of that (one with zero ->key). Upon the exit we have the same + * picture as after the successful ext4_get_block(), except that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. + * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. 
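 *
 * [Editorial sketch, not part of the patch] Layout built by the loop
 * below for indirect_blks = 2 and ar.len data blocks:
 *
 *	new_blocks[0]	double-indirect block (zeroed, one pointer set)
 *	new_blocks[1]	indirect block (zeroed, ar.len pointers set)
 *	new_blocks[2]	first of ar.len contiguous data blocks
 *
 * Only branch[0].p is left unset; splicing that one pointer in later
 * is what makes the whole branch visible atomically.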
+ */ +static int ext4_alloc_branch(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, int indirect_blks, + int *blks, ext4_fsblk_t goal, + ext4_lblk_t *offsets, Indirect *branch) +{ + struct ext4_allocation_request ar; + struct buffer_head * bh; + ext4_fsblk_t b, new_blocks[4]; + __le32 *p; + int i, j, err, len = 1; + + /* + * Set up for the direct block allocation + */ + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.len = *blks; + ar.logical = iblock; + if (S_ISREG(inode->i_mode)) + ar.flags = EXT4_MB_HINT_DATA; + + for (i = 0; i <= indirect_blks; i++) { + if (i == indirect_blks) { + ar.goal = goal; + new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err); + } else + goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode, + goal, 0, NULL, &err); + if (err) { + i--; + goto failed; + } + branch[i].key = cpu_to_le32(new_blocks[i]); + if (i == 0) + continue; + + bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]); + if (unlikely(!bh)) { + err = -ENOMEM; + goto failed; + } + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext4_journal_get_create_access(handle, bh); + if (err) { + unlock_buffer(bh); + goto failed; + } + + memset(bh->b_data, 0, bh->b_size); + p = branch[i].p = (__le32 *) bh->b_data + offsets[i]; + b = new_blocks[i]; + + if (i == indirect_blks) + len = ar.len; + for (j = 0; j < len; j++) + *p++ = cpu_to_le32(b++); + + BUFFER_TRACE(bh, "marking uptodate"); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (err) + goto failed; + } + *blks = ar.len; + return 0; +failed: + for (; i >= 0; i--) { + /* + * We want to ext4_forget() only freshly allocated indirect + * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and + * buffer at branch[0].bh is indirect block / inode already + * existing before ext4_alloc_branch() was called. + */ + if (i > 0 && i != indirect_blks && branch[i].bh) + ext4_forget(handle, 1, inode, branch[i].bh, + branch[i].bh->b_blocknr); + ext4_free_blocks(handle, inode, NULL, new_blocks[i], + (i == indirect_blks) ? ar.len : 1, 0); + } + return err; +} + +/** + * ext4_splice_branch - splice the allocated branch onto inode. + * @handle: handle for this transaction + * @inode: owner + * @block: (logical) number of block we are adding + * @chain: chain of indirect blocks (with a missing link - see + * ext4_alloc_branch) + * @where: location of missing link + * @num: number of indirect blocks we are adding + * @blks: number of direct blocks we are adding + * + * This function fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. + */ +static int ext4_splice_branch(handle_t *handle, struct inode *inode, + ext4_lblk_t block, Indirect *where, int num, + int blks) +{ + int i; + int err = 0; + ext4_fsblk_t current_block; + + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. 
+ */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, where->bh); + if (err) + goto err_out; + } + /* That's it */ + + *where->p = where->key; + + /* + * Update the host buffer_head or inode to point to more just allocated + * direct blocks blocks + */ + if (num == 0 && blks > 1) { + current_block = le32_to_cpu(where->key) + 1; + for (i = 1; i < blks; i++) + *(where->p + i) = cpu_to_le32(current_block++); + } + + /* We are done with atomic stuff, now do the rest of housekeeping */ + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. + */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + */ + ext4_mark_inode_dirty(handle, inode); + jbd_debug(5, "splicing direct\n"); + } + return err; + +err_out: + for (i = 1; i <= num; i++) { + /* + * branch[i].bh is newly allocated, so there is no + * need to revoke the block, which is why we don't + * need to set EXT4_FREE_BLOCKS_METADATA. + */ + ext4_free_blocks(handle, inode, where[i].bh, 0, 1, + EXT4_FREE_BLOCKS_FORGET); + } + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), + blks, 0); + + return err; +} + +/* + * The ext4_ind_map_blocks() function handles non-extents inodes + * (i.e., using the traditional indirect/double-indirect i_blocks + * scheme) for ext4_map_blocks(). + * + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * `handle' can be NULL if create == 0. + * + * return > 0, # of blocks mapped or allocated. + * return = 0, if plain lookup failed. + * return < 0, error case. + * + * The ext4_ind_get_blocks() function should be called with + * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem + * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system + * blocks. 
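 *
 * [Editorial sketch, not part of the patch] A lookup-only call - legal
 * with a NULL handle because EXT4_GET_BLOCKS_CREATE is not set (values
 * illustrative, an inode is assumed in scope):
 *
 *	struct ext4_map_blocks map = { .m_lblk = 0, .m_len = 8 };
 *	int ret = ext4_ind_map_blocks(NULL, inode, &map, 0);
 *
 * ret > 0: map.m_pblk maps map.m_lblk for ret blocks and m_flags gains
 * EXT4_MAP_MAPPED; ret == 0: a hole; ret < 0: an error.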
+ */ +int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + int flags) +{ + int err = -EIO; + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + ext4_fsblk_t goal; + int indirect_blks; + int blocks_to_boundary = 0; + int depth; + int count = 0; + ext4_fsblk_t first_block = 0; + + trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); + J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); + J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); + depth = ext4_block_to_path(inode, map->m_lblk, offsets, + &blocks_to_boundary); + + if (depth == 0) + goto out; + + partial = ext4_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + first_block = le32_to_cpu(chain[depth - 1].key); + count++; + /*map more blocks*/ + while (count < map->m_len && count <= blocks_to_boundary) { + ext4_fsblk_t blk; + + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) + count++; + else + break; + } + goto got_it; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) + goto cleanup; + + /* + * Okay, we need to do block allocation. + */ + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + EXT4_ERROR_INODE(inode, "Can't allocate blocks for " + "non-extent mapped inodes with bigalloc"); + return -ENOSPC; + } + + goal = ext4_find_goal(inode, map->m_lblk, partial); + + /* the number of blocks need to allocate for [d,t]indirect blocks */ + indirect_blks = (chain + depth) - partial - 1; + + /* + * Next look up the indirect map to count the totoal number of + * direct blocks to allocate for this branch. + */ + count = ext4_blks_to_allocate(partial, indirect_blks, + map->m_len, blocks_to_boundary); + /* + * Block out ext4_truncate while we alter the tree + */ + err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, + &count, goal, + offsets + (partial - chain), partial); + + /* + * The ext4_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. --sct + */ + if (!err) + err = ext4_splice_branch(handle, inode, map->m_lblk, + partial, indirect_blks, count); + if (err) + goto cleanup; + + map->m_flags |= EXT4_MAP_NEW; + + ext4_update_inode_fsync_trans(handle, inode, 1); +got_it: + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = le32_to_cpu(chain[depth-1].key); + map->m_len = count; + if (count > blocks_to_boundary) + map->m_flags |= EXT4_MAP_BOUNDARY; + err = count; + /* Clean up and exit */ + partial = chain + depth - 1; /* the whole chain */ +cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +out: + trace_ext4_ind_map_blocks_exit(inode, flags, map, err); + return err; +} + +/* + * O_DIRECT for ext3 (or indirect map) based files + * + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. + * + * If the O_DIRECT write is intantiating holes inside i_size and the machine + * crashes then stale disk data _may_ be exposed inside the file. 
But current + * VFS code falls back into buffered path in that case so we are safe. + */ +ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, + struct iov_iter *iter, loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); + handle_t *handle; + ssize_t ret; + int orphan = 0; + size_t count = iov_iter_count(iter); + int retries = 0; + + if (rw == WRITE) { + loff_t final_size = offset + count; + + if (final_size > inode->i_size) { + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + orphan = 1; + ei->i_disksize = inode->i_size; + ext4_journal_stop(handle); + } + } + +retry: + if (rw == READ && ext4_should_dioread_nolock(inode)) { + /* + * Nolock dioread optimization may be dynamically disabled + * via ext4_inode_block_unlocked_dio(). Check inode's state + * while holding extra i_dio_count ref. + */ + atomic_inc(&inode->i_dio_count); + smp_mb(); + if (unlikely(ext4_test_inode_state(inode, + EXT4_STATE_DIOREAD_LOCK))) { + inode_dio_done(inode); + goto locked; + } + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iter, offset, + ext4_get_block, NULL, NULL, 0); + inode_dio_done(inode); + } else { +locked: + ret = blockdev_direct_IO(rw, iocb, inode, iter, + offset, ext4_get_block); + + if (unlikely((rw & WRITE) && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = offset + count; + + if (end > isize) + ext4_truncate_failed_write(inode); + } + } + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + if (orphan) { + int err; + + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + /* This is really bad luck. We've written the data + * but cannot extend i_size. Bail out and pretend + * the write failed... */ + ret = PTR_ERR(handle); + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + + goto out; + } + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + if (ret > 0) { + loff_t end = offset + ret; + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext4_mark_inode_dirty() to userspace. So + * ignore it. 
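 *
 * [Editorial sketch, not part of the patch] The size-extending
 * O_DIRECT protocol implemented above, step by step:
 *
 *	1. journal_start; ext4_orphan_add(); i_disksize = old i_size;
 *	   journal_stop
 *	2. __blockdev_direct_IO() performs the write
 *	3. journal_start; ext4_orphan_del(); on success push i_size and
 *	   i_disksize out to offset + ret; journal_stop
 *
 * A crash between steps 1 and 3 leaves the inode on the orphan list,
 * so journal recovery truncates it back and no blocks past the old
 * size are leaked.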
+ */ + ext4_mark_inode_dirty(handle, inode); + } + } + err = ext4_journal_stop(handle); + if (ret == 0) + ret = err; + } +out: + return ret; +} + +/* + * Calculate the number of metadata blocks need to reserve + * to allocate a new block at @lblocks for non extent file based file + */ +int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); + int blk_bits; + + if (lblock < EXT4_NDIR_BLOCKS) + return 0; + + lblock -= EXT4_NDIR_BLOCKS; + + if (ei->i_da_metadata_calc_len && + (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { + ei->i_da_metadata_calc_len++; + return 0; + } + ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; + ei->i_da_metadata_calc_len = 1; + blk_bits = order_base_2(lblock); + return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; +} + +/* + * Calculate number of indirect blocks touched by mapping @nrblocks logically + * contiguous blocks + */ +int ext4_ind_trans_blocks(struct inode *inode, int nrblocks) +{ + /* + * With N contiguous data blocks, we need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, + * 2 dindirect blocks, and 1 tindirect block + */ + return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; +} + +/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. + * + * Try to extend this transaction for the purposes of truncation. If + * extend fails, we need to propagate the failure up and restart the + * transaction in the top-level truncate loop. --sct + * + * Returns 0 if we managed to create more room. If we can't create more + * room, and the transaction must be restarted we return 1. + */ +static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +{ + if (!ext4_handle_valid(handle)) + return 0; + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) + return 0; + if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) + return 0; + return 1; +} + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(__le32 *p, __le32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext4_find_shared - find the indirect blocks for partial truncation. + * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext4_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext4_truncate(). + * + * When we do truncate() we may have to clean the ends of several + * indirect blocks but leave the blocks themselves alive. Block is + * partially truncated if some data below the new i_size is referred + * from it (and it is on the path to the first completely truncated + * data block, indeed). We have to free the top of that path along + * with everything to the right of the path. Since no allocation + * past the truncation point is possible until ext4_truncate() + * finishes, we may safely do the latter, but top of branch may + * require special attention - pageout below the truncation point + * might try to populate it. 
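 *
 * [Editorial sketch, not part of the patch] Worked example for
 * ext4_ind_trans_blocks() above, with 4 KiB blocks
 * (EXT4_ADDR_PER_BLOCK = 1024): mapping 100 contiguous blocks needs at
 * most DIV_ROUND_UP(100, 1024) + 4 = 1 + 4 = 5 metadata-block credits
 * (the indirect blocks the run spans, plus one extra indirect, two
 * double-indirect and one triple-indirect in the worst case).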
+ * + * We atomically detach the top of branch from the tree, store the + * block number of its root in *@top, pointers to buffer_heads of + * partially truncated blocks - in @chain[].bh and pointers to + * their last elements that should not be removed - in + * @chain[].p. Return value is the pointer to last filled element + * of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0]. + * (no partially truncated stuff there). */ + +static Indirect *ext4_find_shared(struct inode *inode, int depth, + ext4_lblk_t offsets[4], Indirect chain[4], + __le32 *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + /* Make k index the deepest non-null offset + 1 */ + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext4_get_branch(inode, k, offsets, chain, &err); + /* Writer: pointers */ + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + if (!partial->key && *partial->p) + /* Writer: end */ + goto no_top; + for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + /* Nope, don't do this in ext4. Must leave the tree intact */ +#if 0 + *p->p = 0; +#endif + } + /* Writer: end */ + + while (partial > p) { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/* + * Zero a number of block pointers in either an inode or an indirect block. + * If we restart the transaction we must again get write access to the + * indirect block for further modification. + * + * We release `count' blocks on disk, but (last - first) may be greater + * than `count' because there can be holes in there. + * + * Return 0 on success, 1 on invalid block range + * and < 0 on fatal error. 
+ */ +static int ext4_clear_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, + ext4_fsblk_t block_to_free, + unsigned long count, __le32 *first, + __le32 *last) +{ + __le32 *p; + int flags = EXT4_FREE_BLOCKS_VALIDATED; + int err; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA; + else if (ext4_should_journal_data(inode)) + flags |= EXT4_FREE_BLOCKS_FORGET; + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, + count)) { + EXT4_ERROR_INODE(inode, "attempt to clear invalid " + "blocks %llu len %lu", + (unsigned long long) block_to_free, count); + return 1; + } + + if (try_to_extend_transaction(handle, inode)) { + if (bh) { + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (unlikely(err)) + goto out_err; + } + err = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err)) + goto out_err; + err = ext4_truncate_restart_trans(handle, inode, + ext4_blocks_for_truncate(inode)); + if (unlikely(err)) + goto out_err; + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) + goto out_err; + } + } + + for (p = first; p < last; p++) + *p = 0; + + ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); + return 0; +out_err: + ext4_std_error(inode->i_sb, err); + return err; +} + +/** + * ext4_free_data - free a list of data blocks + * @handle: handle for this transaction + * @inode: inode we are dealing with + * @this_bh: indirect buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: points immediately past the end of array + * + * We are freeing all blocks referred from that array (numbers are stored as + * little-endian 32-bit) and updating @inode->i_blocks appropriately. + * + * We accumulate contiguous runs of blocks to free. Conveniently, if these + * blocks are contiguous then releasing them at one time will only affect one + * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't + * actually use a lot of journal space. + * + * @this_bh will be %NULL if @first and @last point into the inode's direct + * block pointers. + */ +static void ext4_free_data(handle_t *handle, struct inode *inode, + struct buffer_head *this_bh, + __le32 *first, __le32 *last) +{ + ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ + unsigned long count = 0; /* Number of blocks in the run */ + __le32 *block_to_free_p = NULL; /* Pointer into inode/ind + corresponding to + block_to_free */ + ext4_fsblk_t nr; /* Current block # */ + __le32 *p; /* Pointer into inode/ind + for current block */ + int err = 0; + + if (this_bh) { /* For indirect block */ + BUFFER_TRACE(this_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, this_bh); + /* Important: if we can't update the indirect pointers + * to the blocks, we can't free them. 
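 *
 * [Editorial sketch, not part of the patch] How the run batching below
 * behaves for pointers { 100, 101, 102, 200, 0, 201 }: it flushes
 * (100, count 3) when 200 breaks the run, skips the zero slot without
 * ending the run, extends (200, count 1) to (200, count 2) when 201
 * arrives, and flushes that at loop exit - which is exactly why
 * (last - first) may exceed count.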
*/ + if (err) + return; + } + + for (p = first; p < last; p++) { + nr = le32_to_cpu(*p); + if (nr) { + /* accumulate blocks to free if they're contiguous */ + if (count == 0) { + block_to_free = nr; + block_to_free_p = p; + count = 1; + } else if (nr == block_to_free + count) { + count++; + } else { + err = ext4_clear_blocks(handle, inode, this_bh, + block_to_free, count, + block_to_free_p, p); + if (err) + break; + block_to_free = nr; + block_to_free_p = p; + count = 1; + } + } + } + + if (!err && count > 0) + err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + if (err < 0) + /* fatal error */ + return; + + if (this_bh) { + BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); + + /* + * The buffer head should have an attached journal head at this + * point. However, if the data is corrupted and an indirect + * block pointed to itself, it would have been detached when + * the block was cleared. Check for this instead of OOPSing. + */ + if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) + ext4_handle_dirty_metadata(handle, inode, this_bh); + else + EXT4_ERROR_INODE(inode, + "circular indirect block detected at " + "block %llu", + (unsigned long long) this_bh->b_blocknr); + } +} + +/** + * ext4_free_branches - free an array of branches + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @parent_bh: the buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks referred from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static void ext4_free_branches(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, + __le32 *first, __le32 *last, int depth) +{ + ext4_fsblk_t nr; + __le32 *p; + + if (ext4_handle_is_aborted(handle)) + return; + + if (depth--) { + struct buffer_head *bh; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + p = last; + while (--p >= first) { + nr = le32_to_cpu(*p); + if (!nr) + continue; /* A hole */ + + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), + nr, 1)) { + EXT4_ERROR_INODE(inode, + "invalid indirect mapped " + "block %lu (level %d)", + (unsigned long) nr, depth); + break; + } + + /* Go read the buffer for the next level down */ + bh = sb_bread(inode->i_sb, nr); + + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (!bh) { + EXT4_ERROR_INODE_BLOCK(inode, nr, + "Read failure"); + continue; + } + + /* This zaps the entire block. Bottom up. */ + BUFFER_TRACE(bh, "free child branches"); + ext4_free_branches(handle, inode, bh, + (__le32 *) bh->b_data, + (__le32 *) bh->b_data + addr_per_block, + depth); + brelse(bh); + + /* + * Everything below this this pointer has been + * released. Now let this top-of-subtree go. + * + * We want the freeing of this indirect block to be + * atomic in the journal with the updating of the + * bitmap block which owns it. So make some room in + * the journal. + * + * We zero the parent pointer *after* freeing its + * pointee in the bitmaps, so if extend_transaction() + * for some reason fails to put the bitmap changes and + * the release into the same transaction, recovery + * will merely complain about releasing a free block, + * rather than leaking blocks. 
+ */ + if (ext4_handle_is_aborted(handle)) + return; + if (try_to_extend_transaction(handle, inode)) { + ext4_mark_inode_dirty(handle, inode); + ext4_truncate_restart_trans(handle, inode, + ext4_blocks_for_truncate(inode)); + } + + /* + * The forget flag here is critical because if + * we are journaling (and not doing data + * journaling), we have to make sure a revoke + * record is written to prevent the journal + * replay from overwriting the (former) + * indirect block if it gets reallocated as a + * data block. This must happen in the same + * transaction where the data blocks are + * actually freed. + */ + ext4_free_blocks(handle, inode, NULL, nr, 1, + EXT4_FREE_BLOCKS_METADATA| + EXT4_FREE_BLOCKS_FORGET); + + if (parent_bh) { + /* + * The block which we have just freed is + * pointed to by an indirect block: journal it + */ + BUFFER_TRACE(parent_bh, "get_write_access"); + if (!ext4_journal_get_write_access(handle, + parent_bh)){ + *p = 0; + BUFFER_TRACE(parent_bh, + "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, + inode, + parent_bh); + } + } + } + } else { + /* We have reached the bottom of the tree. */ + BUFFER_TRACE(parent_bh, "free data blocks"); + ext4_free_data(handle, inode, parent_bh, first, last); + } +} + +void ext4_ind_truncate(handle_t *handle, struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + __le32 *i_data = ei->i_data; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + ext4_lblk_t offsets[4]; + Indirect chain[4]; + Indirect *partial; + __le32 nr = 0; + int n = 0; + ext4_lblk_t last_block, max_block; + unsigned blocksize = inode->i_sb->s_blocksize; + + last_block = (inode->i_size + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + + if (last_block != max_block) { + n = ext4_block_to_path(inode, last_block, offsets, NULL); + if (n == 0) + return; + } + + ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); + + /* + * The orphan list entry will now protect us from any crash which + * occurs before the truncate completes, so it is now safe to propagate + * the new, shorter inode size (held for now in i_size) into the + * on-disk inode. We do this via i_disksize, which is the value which + * ext4 *really* writes onto the disk inode. + */ + ei->i_disksize = inode->i_size; + + if (last_block == max_block) { + /* + * It is unnecessary to free any data blocks if last_block is + * equal to the indirect block limit. + */ + return; + } else if (n == 1) { /* direct blocks */ + ext4_free_data(handle, inode, NULL, i_data+offsets[0], + i_data + EXT4_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (not detached) */ + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + /* + * We mark the inode dirty prior to restart, + * and prior to stop. No need for it here. 
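 *
 * [Editorial sketch, not part of the patch] The rounding at the top of
 * ext4_ind_truncate() above: with 4096-byte blocks and i_size = 10000,
 * last_block = (10000 + 4095) >> 12 = 3, so logical blocks 0..2
 * survive (block 2 holds the partial tail) and freeing starts at
 * logical block 3.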
+ */ + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext4_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext4_free_branches(handle, inode, partial->bh, partial->p + 1, + (__le32*)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT4_IND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); + i_data[EXT4_IND_BLOCK] = 0; + } + case EXT4_IND_BLOCK: + nr = i_data[EXT4_DIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); + i_data[EXT4_DIND_BLOCK] = 0; + } + case EXT4_DIND_BLOCK: + nr = i_data[EXT4_TIND_BLOCK]; + if (nr) { + ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); + i_data[EXT4_TIND_BLOCK] = 0; + } + case EXT4_TIND_BLOCK: + ; + } +} + +static int free_hole_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, __le32 *i_data, + int level, ext4_lblk_t first, + ext4_lblk_t count, int max) +{ + struct buffer_head *bh = NULL; + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ret = 0; + int i, inc; + ext4_lblk_t offset; + __le32 blk; + + inc = 1 << ((EXT4_BLOCK_SIZE_BITS(inode->i_sb) - 2) * level); + for (i = 0, offset = 0; i < max; i++, i_data++, offset += inc) { + if (offset >= count + first) + break; + if (*i_data == 0 || (offset + inc) <= first) + continue; + blk = *i_data; + if (level > 0) { + ext4_lblk_t first2; + ext4_lblk_t count2; + + bh = sb_bread(inode->i_sb, le32_to_cpu(blk)); + if (!bh) { + EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk), + "Read failure"); + return -EIO; + } + if (first > offset) { + first2 = first - offset; + count2 = count; + } else { + first2 = 0; + count2 = count - (offset - first); + } + ret = free_hole_blocks(handle, inode, bh, + (__le32 *)bh->b_data, level - 1, + first2, count2, + inode->i_sb->s_blocksize >> 2); + if (ret) { + brelse(bh); + goto err; + } + } + if (level == 0 || + (bh && all_zeroes((__le32 *)bh->b_data, + (__le32 *)bh->b_data + addr_per_block))) { + ext4_free_data(handle, inode, parent_bh, + i_data, i_data + 1); + } + brelse(bh); + bh = NULL; + } + +err: + return ret; +} + +int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t first, ext4_lblk_t stop) +{ + int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int level, ret = 0; + int num = EXT4_NDIR_BLOCKS; + ext4_lblk_t count, max = EXT4_NDIR_BLOCKS; + __le32 *i_data = EXT4_I(inode)->i_data; + + count = stop - first; + for (level = 0; level < 4; level++, max *= addr_per_block) { + if (first < max) { + ret = free_hole_blocks(handle, inode, NULL, i_data, + level, first, count, num); + if (ret) + goto err; + if (count > max - first) + count -= max - first; + else + break; + first = 0; + } else { + first -= max; + } + i_data += num; + if (level == 0) { + num = 1; + max = 1; + } + } + +err: + return ret; +} + diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c new file mode 100644 index 00000000000..645205d8ada --- /dev/null +++ b/fs/ext4/inline.c @@ -0,0 +1,2000 @@ +/* + * Copyright (c) 2012 Taobao. 
+ * Written by Tao Ma <boyu.mt@taobao.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" +#include "truncate.h" +#include <linux/fiemap.h> + +#define EXT4_XATTR_SYSTEM_DATA "data" +#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) +#define EXT4_INLINE_DOTDOT_OFFSET 2 +#define EXT4_INLINE_DOTDOT_SIZE 4 + +static int ext4_get_inline_size(struct inode *inode) +{ + if (EXT4_I(inode)->i_inline_off) + return EXT4_I(inode)->i_inline_size; + + return 0; +} + +static int get_max_inline_xattr_value_size(struct inode *inode, + struct ext4_iloc *iloc) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry; + struct ext4_inode *raw_inode; + int free, min_offs; + + min_offs = EXT4_SB(inode->i_sb)->s_inode_size - + EXT4_GOOD_OLD_INODE_SIZE - + EXT4_I(inode)->i_extra_isize - + sizeof(struct ext4_xattr_ibody_header); + + /* + * We need to subtract another sizeof(__u32) since an in-inode xattr + * needs an empty 4 bytes to indicate the gap between the xattr entry + * and the name/value pair. + */ + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + return EXT4_XATTR_SIZE(min_offs - + EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) - + EXT4_XATTR_ROUND - sizeof(__u32)); + + raw_inode = ext4_raw_inode(iloc); + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + + /* Compute min_offs. */ + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_block && entry->e_value_size) { + size_t offs = le16_to_cpu(entry->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + } + free = min_offs - + ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32); + + if (EXT4_I(inode)->i_inline_off) { + entry = (struct ext4_xattr_entry *) + ((void *)raw_inode + EXT4_I(inode)->i_inline_off); + + free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)); + goto out; + } + + free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)); + + if (free > EXT4_XATTR_ROUND) + free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND); + else + free = 0; + +out: + return free; +} + +/* + * Get the maximum size we now can store in an inode. + * If we can't find the space for a xattr entry, don't use the space + * of the extents since we have no space to indicate the inline data. 
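+ *
+ * Illustrative sketch of the arithmetic above (the concrete numbers
+ * are an example, not part of this change): with a 256-byte on-disk
+ * inode and an i_extra_isize of 32, min_offs starts out at
+ * 256 - 128 - 32 - 4 = 92 bytes; the entry header for "data"
+ * (EXT4_XATTR_LEN) and the 4-byte gap are then subtracted, and the
+ * caller adds the 60 bytes of i_block (EXT4_MIN_INLINE_DATA_SIZE)
+ * on top of whatever xattr value room is left.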
+ */
+int ext4_get_max_inline_size(struct inode *inode)
+{
+ int error, max_inline_size;
+ struct ext4_iloc iloc;
+
+ if (EXT4_I(inode)->i_extra_isize == 0)
+ return 0;
+
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error) {
+ ext4_error_inode(inode, __func__, __LINE__, 0,
+ "can't get inode location %lu",
+ inode->i_ino);
+ return 0;
+ }
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ max_inline_size = get_max_inline_xattr_value_size(inode, &iloc);
+ up_read(&EXT4_I(inode)->xattr_sem);
+
+ brelse(iloc.bh);
+
+ if (!max_inline_size)
+ return 0;
+
+ return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE;
+}
+
+int ext4_has_inline_data(struct inode *inode)
+{
+ return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) &&
+ EXT4_I(inode)->i_inline_off;
+}
+
+/*
+ * This function does not take xattr_sem, which is OK because it is
+ * currently only used in a code path coming from ext4_iget, before
+ * the new inode has been unlocked.
+ */
+int ext4_find_inline_data_nolock(struct inode *inode)
+{
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = -ENODATA, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ };
+ int error;
+
+ if (EXT4_I(inode)->i_extra_isize == 0)
+ return 0;
+
+ error = ext4_get_inode_loc(inode, &is.iloc);
+ if (error)
+ return error;
+
+ error = ext4_xattr_ibody_find(inode, &i, &is);
+ if (error)
+ goto out;
+
+ if (!is.s.not_found) {
+ EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+ (void *)ext4_raw_inode(&is.iloc));
+ EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
+ le32_to_cpu(is.s.here->e_value_size);
+ ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ }
+out:
+ brelse(is.iloc.bh);
+ return error;
+}
+
+static int ext4_read_inline_data(struct inode *inode, void *buffer,
+ unsigned int len,
+ struct ext4_iloc *iloc)
+{
+ struct ext4_xattr_entry *entry;
+ struct ext4_xattr_ibody_header *header;
+ int cp_len = 0;
+ struct ext4_inode *raw_inode;
+
+ if (!len)
+ return 0;
+
+ BUG_ON(len > EXT4_I(inode)->i_inline_size);
+
+ cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ?
+ len : EXT4_MIN_INLINE_DATA_SIZE;
+
+ raw_inode = ext4_raw_inode(iloc);
+ memcpy(buffer, (void *)(raw_inode->i_block), cp_len);
+
+ len -= cp_len;
+ buffer += cp_len;
+
+ if (!len)
+ goto out;
+
+ header = IHDR(inode, raw_inode);
+ entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+ EXT4_I(inode)->i_inline_off);
+ len = min_t(unsigned int, len,
+ (unsigned int)le32_to_cpu(entry->e_value_size));
+
+ memcpy(buffer,
+ (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len);
+ cp_len += len;
+
+out:
+ return cp_len;
+}
+
+/*
+ * Write the buffer to the inline inode.
+ * If 'create' is set, we don't need to do the extra copy in the xattr
+ * value since it is already handled by ext4_xattr_ibody_inline_set.
+ * That saves us one memcpy.
+ */
+static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
+ void *buffer, loff_t pos, unsigned int len)
+{
+ struct ext4_xattr_entry *entry;
+ struct ext4_xattr_ibody_header *header;
+ struct ext4_inode *raw_inode;
+ int cp_len = 0;
+
+ BUG_ON(!EXT4_I(inode)->i_inline_off);
+ BUG_ON(pos + len > EXT4_I(inode)->i_inline_size);
+
+ raw_inode = ext4_raw_inode(iloc);
+ buffer += pos;
+
+ if (pos < EXT4_MIN_INLINE_DATA_SIZE) {
+ cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ?
+ EXT4_MIN_INLINE_DATA_SIZE - pos : len;
+ memcpy((void *)raw_inode->i_block + pos, buffer, cp_len);
+
+ len -= cp_len;
+ buffer += cp_len;
+ pos += cp_len;
+ }
+
+ if (!len)
+ return;
+
+ pos -= EXT4_MIN_INLINE_DATA_SIZE;
+ header = IHDR(inode, raw_inode);
+ entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+ EXT4_I(inode)->i_inline_off);
+
+ memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos,
+ buffer, len);
+}
+
+static int ext4_create_inline_data(handle_t *handle,
+ struct inode *inode, unsigned len)
+{
+ int error;
+ void *value = NULL;
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = -ENODATA, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ };
+
+ error = ext4_get_inode_loc(inode, &is.iloc);
+ if (error)
+ return error;
+
+ BUFFER_TRACE(is.iloc.bh, "get_write_access");
+ error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ if (error)
+ goto out;
+
+ if (len > EXT4_MIN_INLINE_DATA_SIZE) {
+ value = EXT4_ZERO_XATTR_VALUE;
+ len -= EXT4_MIN_INLINE_DATA_SIZE;
+ } else {
+ value = "";
+ len = 0;
+ }
+
+ /* Insert the xattr entry. */
+ i.value = value;
+ i.value_len = len;
+
+ error = ext4_xattr_ibody_find(inode, &i, &is);
+ if (error)
+ goto out;
+
+ BUG_ON(!is.s.not_found);
+
+ error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ if (error) {
+ if (error == -ENOSPC)
+ ext4_clear_inode_state(inode,
+ EXT4_STATE_MAY_INLINE_DATA);
+ goto out;
+ }
+
+ memset((void *)ext4_raw_inode(&is.iloc)->i_block,
+ 0, EXT4_MIN_INLINE_DATA_SIZE);
+
+ EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+ (void *)ext4_raw_inode(&is.iloc));
+ EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE;
+ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+ ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+ get_bh(is.iloc.bh);
+ error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+out:
+ brelse(is.iloc.bh);
+ return error;
+}
+
+static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
+ unsigned int len)
+{
+ int error;
+ void *value = NULL;
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = -ENODATA, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ };
+
+ /* If the old space is ok, write the data directly. */
+ if (len <= EXT4_I(inode)->i_inline_size)
+ return 0;
+
+ error = ext4_get_inode_loc(inode, &is.iloc);
+ if (error)
+ return error;
+
+ error = ext4_xattr_ibody_find(inode, &i, &is);
+ if (error)
+ goto out;
+
+ BUG_ON(is.s.not_found);
+
+ len -= EXT4_MIN_INLINE_DATA_SIZE;
+ value = kzalloc(len, GFP_NOFS);
+ if (!value) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ error = ext4_xattr_ibody_get(inode, i.name_index, i.name,
+ value, len);
+ if (error == -ENODATA)
+ goto out;
+
+ BUFFER_TRACE(is.iloc.bh, "get_write_access");
+ error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ if (error)
+ goto out;
+
+ /* Update the xattr entry.
*/ + i.value = value; + i.value_len = len; + + error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); + if (error) + goto out; + + EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - + (void *)ext4_raw_inode(&is.iloc)); + EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + + le32_to_cpu(is.s.here->e_value_size); + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + +out: + kfree(value); + brelse(is.iloc.bh); + return error; +} + +static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, + unsigned int len) +{ + int ret, size; + struct ext4_inode_info *ei = EXT4_I(inode); + + if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) + return -ENOSPC; + + size = ext4_get_max_inline_size(inode); + if (size < len) + return -ENOSPC; + + down_write(&EXT4_I(inode)->xattr_sem); + + if (ei->i_inline_off) + ret = ext4_update_inline_data(handle, inode, len); + else + ret = ext4_create_inline_data(handle, inode, len); + + up_write(&EXT4_I(inode)->xattr_sem); + + return ret; +} + +static int ext4_destroy_inline_data_nolock(handle_t *handle, + struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_xattr_ibody_find is = { + .s = { .not_found = 0, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + .value = NULL, + .value_len = 0, + }; + int error; + + if (!ei->i_inline_off) + return 0; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + BUFFER_TRACE(is.iloc.bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto out; + + error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); + if (error) + goto out; + + memset((void *)ext4_raw_inode(&is.iloc)->i_block, + 0, EXT4_MIN_INLINE_DATA_SIZE); + + if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS)) { + if (S_ISDIR(inode->i_mode) || + S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode); + } + } + ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); + + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + + EXT4_I(inode)->i_inline_off = 0; + EXT4_I(inode)->i_inline_size = 0; + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); +out: + brelse(is.iloc.bh); + if (error == -ENODATA) + error = 0; + return error; +} + +static int ext4_read_inline_page(struct inode *inode, struct page *page) +{ + void *kaddr; + int ret = 0; + size_t len; + struct ext4_iloc iloc; + + BUG_ON(!PageLocked(page)); + BUG_ON(!ext4_has_inline_data(inode)); + BUG_ON(page->index); + + if (!EXT4_I(inode)->i_inline_off) { + ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.", + inode->i_ino); + goto out; + } + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + goto out; + + len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); + kaddr = kmap_atomic(page); + ret = ext4_read_inline_data(inode, kaddr, len, &iloc); + flush_dcache_page(page); + kunmap_atomic(kaddr); + zero_user_segment(page, len, PAGE_CACHE_SIZE); + SetPageUptodate(page); + brelse(iloc.bh); + +out: + return ret; +} + +int ext4_readpage_inline(struct inode *inode, struct page *page) +{ + int ret = 0; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + 
up_read(&EXT4_I(inode)->xattr_sem);
+ return -EAGAIN;
+ }
+
+ /*
+ * Current inline data can only exist in the 1st page,
+ * so for all the other pages, just set them uptodate.
+ */
+ if (!page->index)
+ ret = ext4_read_inline_page(inode, page);
+ else if (!PageUptodate(page)) {
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ }
+
+ up_read(&EXT4_I(inode)->xattr_sem);
+
+ unlock_page(page);
+ return ret >= 0 ? 0 : ret;
+}
+
+static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
+ struct inode *inode,
+ unsigned flags)
+{
+ int ret, needed_blocks;
+ handle_t *handle = NULL;
+ int retries = 0, sem_held = 0;
+ struct page *page = NULL;
+ unsigned from, to;
+ struct ext4_iloc iloc;
+
+ if (!ext4_has_inline_data(inode)) {
+ /*
+ * Clear the flag so that no new write
+ * will trap here again.
+ */
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ return 0;
+ }
+
+ needed_blocks = ext4_writepage_trans_blocks(inode);
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+retry:
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ goto out;
+ }
+
+ /* We cannot recurse into the filesystem as the transaction is already
+ * started. */
+ flags |= AOP_FLAG_NOFS;
+
+ page = grab_cache_page_write_begin(mapping, 0, flags);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ sem_held = 1;
+ /* If someone has already done this for us, just exit. */
+ if (!ext4_has_inline_data(inode)) {
+ ret = 0;
+ goto out;
+ }
+
+ from = 0;
+ to = ext4_get_inline_size(inode);
+ if (!PageUptodate(page)) {
+ ret = ext4_read_inline_page(inode, page);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = ext4_destroy_inline_data_nolock(handle, inode);
+ if (ret)
+ goto out;
+
+ if (ext4_should_dioread_nolock(inode))
+ ret = __block_write_begin(page, from, to, ext4_get_block_write);
+ else
+ ret = __block_write_begin(page, from, to, ext4_get_block);
+
+ if (!ret && ext4_should_journal_data(inode)) {
+ ret = ext4_walk_page_buffers(handle, page_buffers(page),
+ from, to, NULL,
+ do_journal_get_write_access);
+ }
+
+ if (ret) {
+ unlock_page(page);
+ page_cache_release(page);
+ ext4_orphan_add(handle, inode);
+ up_write(&EXT4_I(inode)->xattr_sem);
+ sem_held = 0;
+ ext4_journal_stop(handle);
+ handle = NULL;
+ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might
+ * still be on the orphan list; we need to
+ * make sure the inode is removed from the
+ * orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
+
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+
+ block_commit_write(page, from, to);
+out:
+ if (page) {
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ if (sem_held)
+ up_write(&EXT4_I(inode)->xattr_sem);
+ if (handle)
+ ext4_journal_stop(handle);
+ brelse(iloc.bh);
+ return ret;
+}
+
+/*
+ * Try to write data in the inode.
+ * If the inode has inline data, check whether the new write can still
+ * fit in the inode. If not, create the page and the handle, move the
+ * data to the page, mark it uptodate, and let the later code create
+ * extents for it.
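+ *
+ * Return convention, as implemented below: 1 means the write was
+ * served from the inline data and *pagep holds the locked page;
+ * 0 means the inline data vanished under us and the caller should
+ * fall back to the normal write path; negative values are errors.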
+ */
+int ext4_try_to_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep)
+{
+ int ret;
+ handle_t *handle;
+ struct page *page;
+ struct ext4_iloc iloc;
+
+ if (pos + len > ext4_get_max_inline_size(inode))
+ goto convert;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ /*
+ * The possible write could happen in the inode,
+ * so try to reserve space in the inode first.
+ */
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ goto out;
+ }
+
+ ret = ext4_prepare_inline_data(handle, inode, pos + len);
+ if (ret && ret != -ENOSPC)
+ goto out;
+
+ /* We don't have space in the inline inode, so convert it to extents. */
+ if (ret == -ENOSPC) {
+ ext4_journal_stop(handle);
+ brelse(iloc.bh);
+ goto convert;
+ }
+
+ flags |= AOP_FLAG_NOFS;
+
+ page = grab_cache_page_write_begin(mapping, 0, flags);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ *pagep = page;
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ ret = 0;
+ unlock_page(page);
+ page_cache_release(page);
+ goto out_up_read;
+ }
+
+ if (!PageUptodate(page)) {
+ ret = ext4_read_inline_page(inode, page);
+ if (ret < 0)
+ goto out_up_read;
+ }
+
+ ret = 1;
+ handle = NULL;
+out_up_read:
+ up_read(&EXT4_I(inode)->xattr_sem);
+out:
+ if (handle)
+ ext4_journal_stop(handle);
+ brelse(iloc.bh);
+ return ret;
+convert:
+ return ext4_convert_inline_data_to_extent(mapping,
+ inode, flags);
+}
+
+int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
+ unsigned copied, struct page *page)
+{
+ int ret;
+ void *kaddr;
+ struct ext4_iloc iloc;
+
+ if (unlikely(copied < len)) {
+ if (!PageUptodate(page)) {
+ copied = 0;
+ goto out;
+ }
+ }
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret) {
+ ext4_std_error(inode->i_sb, ret);
+ copied = 0;
+ goto out;
+ }
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ BUG_ON(!ext4_has_inline_data(inode));
+
+ kaddr = kmap_atomic(page);
+ ext4_write_inline_data(inode, &iloc, kaddr, pos, len);
+ kunmap_atomic(kaddr);
+ SetPageUptodate(page);
+ /* Clear page dirty so that writepages won't pick it up. */
+ ClearPageDirty(page);
+
+ up_write(&EXT4_I(inode)->xattr_sem);
+ brelse(iloc.bh);
+out:
+ return copied;
+}
+
+struct buffer_head *
+ext4_journalled_write_inline_data(struct inode *inode,
+ unsigned len,
+ struct page *page)
+{
+ int ret;
+ void *kaddr;
+ struct ext4_iloc iloc;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret) {
+ ext4_std_error(inode->i_sb, ret);
+ return NULL;
+ }
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ kaddr = kmap_atomic(page);
+ ext4_write_inline_data(inode, &iloc, kaddr, 0, len);
+ kunmap_atomic(kaddr);
+ up_write(&EXT4_I(inode)->xattr_sem);
+
+ return iloc.bh;
+}
+
+/*
+ * Try to make the page cache and handle ready for the inline data case.
+ * We can call this function in 2 cases:
+ * 1. The inode is created and the first write exceeds inline size. We can
+ * clear the inode state safely.
+ * 2. The inode has inline data, then we need to read the data, make it
+ * uptodate and dirty so that ext4_da_writepages can handle it. We don't
+ * need to start the journal since the file's metadata isn't changed now.
+ */
+static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
+ struct inode *inode,
+ unsigned flags,
+ void **fsdata)
+{
+ int ret = 0, inline_size;
+ struct page *page;
+
+ page = grab_cache_page_write_begin(mapping, 0, flags);
+ if (!page)
+ return -ENOMEM;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ goto out;
+ }
+
+ inline_size = ext4_get_inline_size(inode);
+
+ if (!PageUptodate(page)) {
+ ret = ext4_read_inline_page(inode, page);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = __block_write_begin(page, 0, inline_size,
+ ext4_da_get_block_prep);
+ if (ret) {
+ ext4_truncate_failed_write(inode);
+ goto out;
+ }
+
+ SetPageDirty(page);
+ SetPageUptodate(page);
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ *fsdata = (void *)CONVERT_INLINE_DATA;
+
+out:
+ up_read(&EXT4_I(inode)->xattr_sem);
+ if (page) {
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ return ret;
+}
+
+/*
+ * Prepare the write for the inline data.
+ * If the data can be written into the inode, we just read
+ * the page and make it uptodate, and start the journal.
+ * Otherwise read the page and make it dirty so that it can be
+ * handled in writepages (the i_disksize update is left to the
+ * normal ext4_da_write_end).
+ */
+int ext4_da_write_inline_data_begin(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep,
+ void **fsdata)
+{
+ int ret, inline_size;
+ handle_t *handle;
+ struct page *page;
+ struct ext4_iloc iloc;
+ int retries = 0;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+retry_journal:
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ inline_size = ext4_get_max_inline_size(inode);
+
+ ret = -ENOSPC;
+ if (inline_size >= pos + len) {
+ ret = ext4_prepare_inline_data(handle, inode, pos + len);
+ if (ret && ret != -ENOSPC)
+ goto out_journal;
+ }
+
+ if (ret == -ENOSPC) {
+ ret = ext4_da_convert_inline_data_to_extent(mapping,
+ inode,
+ flags,
+ fsdata);
+ ext4_journal_stop(handle);
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
+ goto out;
+ }
+
+ /*
+ * We cannot recurse into the filesystem as the transaction
+ * is already started.
+ */
+ flags |= AOP_FLAG_NOFS;
+
+ page = grab_cache_page_write_begin(mapping, 0, flags);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out_journal;
+ }
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ ret = 0;
+ goto out_release_page;
+ }
+
+ if (!PageUptodate(page)) {
+ ret = ext4_read_inline_page(inode, page);
+ if (ret < 0)
+ goto out_release_page;
+ }
+
+ up_read(&EXT4_I(inode)->xattr_sem);
+ *pagep = page;
+ brelse(iloc.bh);
+ return 1;
+out_release_page:
+ up_read(&EXT4_I(inode)->xattr_sem);
+ unlock_page(page);
+ page_cache_release(page);
+out_journal:
+ ext4_journal_stop(handle);
+out:
+ brelse(iloc.bh);
+ return ret;
+}
+
+int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+ unsigned len, unsigned copied,
+ struct page *page)
+{
+ int i_size_changed = 0;
+
+ copied = ext4_write_inline_data_end(inode, pos, len, copied, page);
+
+ /*
+ * No need to use i_size_read() here, the i_size
+ * cannot change under us because we hold i_mutex.
+ *
+ * But it's important to update i_size while still holding page lock:
+ * page writeout could otherwise come in and zero beyond i_size.
+ */
+ if (pos+copied > inode->i_size) {
+ i_size_write(inode, pos+copied);
+ i_size_changed = 1;
+ }
+ unlock_page(page);
+ page_cache_release(page);
+
+ /*
+ * Don't mark the inode dirty under page lock. First, it unnecessarily
+ * makes the holding time of page lock longer. Second, it forces lock
+ * ordering of page lock and transaction start for journaling
+ * filesystems.
+ */
+ if (i_size_changed)
+ mark_inode_dirty(inode);
+
+ return copied;
+}
+
+#ifdef INLINE_DIR_DEBUG
+void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
+ void *inline_start, int inline_size)
+{
+ int offset;
+ unsigned short de_len;
+ struct ext4_dir_entry_2 *de = inline_start;
+ void *dlimit = inline_start + inline_size;
+
+ trace_printk("inode %lu\n", dir->i_ino);
+ offset = 0;
+ while ((void *)de < dlimit) {
+ de_len = ext4_rec_len_from_disk(de->rec_len, inline_size);
+ trace_printk("de: off %u rlen %u name %.*s nlen %u ino %u\n",
+ offset, de_len, de->name_len, de->name,
+ de->name_len, le32_to_cpu(de->inode));
+ if (ext4_check_dir_entry(dir, NULL, de, bh,
+ inline_start, inline_size, offset))
+ BUG();
+
+ offset += de_len;
+ de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
+ }
+}
+#else
+#define ext4_show_inline_dir(dir, bh, inline_start, inline_size)
+#endif
+
+/*
+ * Add a new entry into an inline dir.
+ * It will return -ENOSPC if no space is available, and -EIO or -EEXIST
+ * if the directory entry is corrupted or already exists.
+ */
+static int ext4_add_dirent_to_inline(handle_t *handle,
+ struct dentry *dentry,
+ struct inode *inode,
+ struct ext4_iloc *iloc,
+ void *inline_start, int inline_size)
+{
+ struct inode *dir = dentry->d_parent->d_inode;
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ int err;
+ struct ext4_dir_entry_2 *de;
+
+ err = ext4_find_dest_de(dir, inode, iloc->bh,
+ inline_start, inline_size,
+ name, namelen, &de);
+ if (err)
+ return err;
+
+ BUFFER_TRACE(iloc->bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, iloc->bh);
+ if (err)
+ return err;
+ ext4_insert_dentry(inode, de, inline_size, name, namelen);
+
+ ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
+
+ /*
+ * XXX shouldn't update any times until successful
+ * completion of syscall, but too many callers depend
+ * on this.
+ *
+ * XXX similarly, too many callers depend on
+ * ext4_new_inode() setting the times, but error
+ * recovery deletes the inode, so the worst that can
+ * happen is that the times are slightly out of date
+ * and/or different from the directory change time.
+ */
+ dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+ ext4_update_dx_flag(dir);
+ dir->i_version++;
+ ext4_mark_inode_dirty(handle, dir);
+ return 1;
+}
+
+static void *ext4_get_inline_xattr_pos(struct inode *inode,
+ struct ext4_iloc *iloc)
+{
+ struct ext4_xattr_entry *entry;
+ struct ext4_xattr_ibody_header *header;
+
+ BUG_ON(!EXT4_I(inode)->i_inline_off);
+
+ header = IHDR(inode, ext4_raw_inode(iloc));
+ entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) +
+ EXT4_I(inode)->i_inline_off);
+
+ return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs);
+}
+
+/* Set the final de to cover the whole block.
*/ +static void ext4_update_final_de(void *de_buf, int old_size, int new_size) +{ + struct ext4_dir_entry_2 *de, *prev_de; + void *limit; + int de_len; + + de = (struct ext4_dir_entry_2 *)de_buf; + if (old_size) { + limit = de_buf + old_size; + do { + prev_de = de; + de_len = ext4_rec_len_from_disk(de->rec_len, old_size); + de_buf += de_len; + de = (struct ext4_dir_entry_2 *)de_buf; + } while (de_buf < limit); + + prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size - + old_size, new_size); + } else { + /* this is just created, so create an empty entry. */ + de->inode = 0; + de->rec_len = ext4_rec_len_to_disk(new_size, new_size); + } +} + +static int ext4_update_inline_dir(handle_t *handle, struct inode *dir, + struct ext4_iloc *iloc) +{ + int ret; + int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; + int new_size = get_max_inline_xattr_value_size(dir, iloc); + + if (new_size - old_size <= EXT4_DIR_REC_LEN(1)) + return -ENOSPC; + + ret = ext4_update_inline_data(handle, dir, + new_size + EXT4_MIN_INLINE_DATA_SIZE); + if (ret) + return ret; + + ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size, + EXT4_I(dir)->i_inline_size - + EXT4_MIN_INLINE_DATA_SIZE); + dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size; + return 0; +} + +static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc, + void *buf, int inline_size) +{ + ext4_create_inline_data(handle, inode, inline_size); + ext4_write_inline_data(inode, iloc, buf, 0, inline_size); + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); +} + +static int ext4_finish_convert_inline_dir(handle_t *handle, + struct inode *inode, + struct buffer_head *dir_block, + void *buf, + int inline_size) +{ + int err, csum_size = 0, header_size = 0; + struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_tail *t; + void *target = dir_block->b_data; + + /* + * First create "." and ".." and then copy the dir information + * back to the block. 
+ */
+ de = (struct ext4_dir_entry_2 *)target;
+ de = ext4_init_dot_dotdot(inode, de,
+ inode->i_sb->s_blocksize, csum_size,
+ le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1);
+ header_size = (void *)de - target;
+
+ memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
+ inline_size - EXT4_INLINE_DOTDOT_SIZE);
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+
+ inode->i_size = inode->i_sb->s_blocksize;
+ i_size_write(inode, inode->i_sb->s_blocksize);
+ EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+ ext4_update_final_de(dir_block->b_data,
+ inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size,
+ inode->i_sb->s_blocksize - csum_size);
+
+ if (csum_size) {
+ t = EXT4_DIRENT_TAIL(dir_block->b_data,
+ inode->i_sb->s_blocksize);
+ initialize_dirent_tail(t, inode->i_sb->s_blocksize);
+ }
+ set_buffer_uptodate(dir_block);
+ err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
+ if (err)
+ goto out;
+ set_buffer_verified(dir_block);
+out:
+ return err;
+}
+
+static int ext4_convert_inline_data_nolock(handle_t *handle,
+ struct inode *inode,
+ struct ext4_iloc *iloc)
+{
+ int error;
+ void *buf = NULL;
+ struct buffer_head *data_bh = NULL;
+ struct ext4_map_blocks map;
+ int inline_size;
+
+ inline_size = ext4_get_inline_size(inode);
+ buf = kmalloc(inline_size, GFP_NOFS);
+ if (!buf) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ error = ext4_read_inline_data(inode, buf, inline_size, iloc);
+ if (error < 0)
+ goto out;
+
+ error = ext4_destroy_inline_data_nolock(handle, inode);
+ if (error)
+ goto out;
+
+ map.m_lblk = 0;
+ map.m_len = 1;
+ map.m_flags = 0;
+ error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE);
+ if (error < 0)
+ goto out_restore;
+ if (!(map.m_flags & EXT4_MAP_MAPPED)) {
+ error = -EIO;
+ goto out_restore;
+ }
+
+ data_bh = sb_getblk(inode->i_sb, map.m_pblk);
+ if (!data_bh) {
+ error = -ENOMEM;
+ goto out_restore;
+ }
+
+ lock_buffer(data_bh);
+ error = ext4_journal_get_create_access(handle, data_bh);
+ if (error) {
+ unlock_buffer(data_bh);
+ error = -EIO;
+ goto out_restore;
+ }
+ memset(data_bh->b_data, 0, inode->i_sb->s_blocksize);
+
+ if (!S_ISDIR(inode->i_mode)) {
+ memcpy(data_bh->b_data, buf, inline_size);
+ set_buffer_uptodate(data_bh);
+ error = ext4_handle_dirty_metadata(handle,
+ inode, data_bh);
+ } else {
+ error = ext4_finish_convert_inline_dir(handle, inode, data_bh,
+ buf, inline_size);
+ }
+
+ unlock_buffer(data_bh);
+out_restore:
+ if (error)
+ ext4_restore_inline_data(handle, inode, iloc, buf, inline_size);
+
+out:
+ brelse(data_bh);
+ kfree(buf);
+ return error;
+}
+
+/*
+ * Try to add the new entry to the inline data.
+ * If it succeeds, return 0. If not, extend the inline dir and copy the
+ * data to a newly created block.
+ */
+int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+{
+ int ret, inline_size;
+ void *inline_start;
+ struct ext4_iloc iloc;
+ struct inode *dir = dentry->d_parent->d_inode;
+
+ ret = ext4_get_inode_loc(dir, &iloc);
+ if (ret)
+ return ret;
+
+ down_write(&EXT4_I(dir)->xattr_sem);
+ if (!ext4_has_inline_data(dir))
+ goto out;
+
+ inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+ EXT4_INLINE_DOTDOT_SIZE;
+ inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
+
+ ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
+ inline_start, inline_size);
+ if (ret != -ENOSPC)
+ goto out;
+
+ /* Check whether it can be inserted into the inline xattr space.
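+ * (An inline dir spans two regions: the i_block array minus the
+ * 4-byte '..' slot, tried above, and optionally the value of the
+ * system.data xattr, tried next.)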
*/
+ inline_size = EXT4_I(dir)->i_inline_size -
+ EXT4_MIN_INLINE_DATA_SIZE;
+ if (!inline_size) {
+ /* Try to use the xattr space. */
+ ret = ext4_update_inline_dir(handle, dir, &iloc);
+ if (ret && ret != -ENOSPC)
+ goto out;
+
+ inline_size = EXT4_I(dir)->i_inline_size -
+ EXT4_MIN_INLINE_DATA_SIZE;
+ }
+
+ if (inline_size) {
+ inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+
+ ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
+ inline_start, inline_size);
+
+ if (ret != -ENOSPC)
+ goto out;
+ }
+
+ /*
+ * The inline space is filled up, so create a new block for it.
+ * As the extent tree will be created, we have to save the inline
+ * dir first.
+ */
+ ret = ext4_convert_inline_data_nolock(handle, dir, &iloc);
+
+out:
+ ext4_mark_inode_dirty(handle, dir);
+ up_write(&EXT4_I(dir)->xattr_sem);
+ brelse(iloc.bh);
+ return ret;
+}
+
+/*
+ * This function fills a red-black tree with information from an
+ * inlined dir. It returns the number of directory entries loaded
+ * into the tree. If there is an error, it is returned in err.
+ */
+int htree_inlinedir_to_tree(struct file *dir_file,
+ struct inode *dir, ext4_lblk_t block,
+ struct dx_hash_info *hinfo,
+ __u32 start_hash, __u32 start_minor_hash,
+ int *has_inline_data)
+{
+ int err = 0, count = 0;
+ unsigned int parent_ino;
+ int pos;
+ struct ext4_dir_entry_2 *de;
+ struct inode *inode = file_inode(dir_file);
+ int ret, inline_size = 0;
+ struct ext4_iloc iloc;
+ void *dir_buf = NULL;
+ struct ext4_dir_entry_2 fake;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ up_read(&EXT4_I(inode)->xattr_sem);
+ *has_inline_data = 0;
+ goto out;
+ }
+
+ inline_size = ext4_get_inline_size(inode);
+ dir_buf = kmalloc(inline_size, GFP_NOFS);
+ if (!dir_buf) {
+ ret = -ENOMEM;
+ up_read(&EXT4_I(inode)->xattr_sem);
+ goto out;
+ }
+
+ ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
+ up_read(&EXT4_I(inode)->xattr_sem);
+ if (ret < 0)
+ goto out;
+
+ pos = 0;
+ parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+ while (pos < inline_size) {
+ /*
+ * As an inlined dir doesn't store any information about '.'
+ * and only the inode number of '..' is stored, we have to
+ * handle them differently.
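+ *
+ * Layout of an inlined dir, for illustration:
+ * bytes 0..3: little-endian inode number of '..'
+ * bytes 4.. : regular ext4_dir_entry_2 records
+ * '.' is not stored at all; both '.' and '..' are synthesized
+ * below through the on-stack 'fake' entry.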
+ */
+ if (pos == 0) {
+ fake.inode = cpu_to_le32(inode->i_ino);
+ fake.name_len = 1;
+ strcpy(fake.name, ".");
+ fake.rec_len = ext4_rec_len_to_disk(
+ EXT4_DIR_REC_LEN(fake.name_len),
+ inline_size);
+ ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+ de = &fake;
+ pos = EXT4_INLINE_DOTDOT_OFFSET;
+ } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
+ fake.inode = cpu_to_le32(parent_ino);
+ fake.name_len = 2;
+ strcpy(fake.name, "..");
+ fake.rec_len = ext4_rec_len_to_disk(
+ EXT4_DIR_REC_LEN(fake.name_len),
+ inline_size);
+ ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+ de = &fake;
+ pos = EXT4_INLINE_DOTDOT_SIZE;
+ } else {
+ de = (struct ext4_dir_entry_2 *)(dir_buf + pos);
+ pos += ext4_rec_len_from_disk(de->rec_len, inline_size);
+ if (ext4_check_dir_entry(inode, dir_file, de,
+ iloc.bh, dir_buf,
+ inline_size, pos)) {
+ ret = count;
+ goto out;
+ }
+ }
+
+ ext4fs_dirhash(de->name, de->name_len, hinfo);
+ if ((hinfo->hash < start_hash) ||
+ ((hinfo->hash == start_hash) &&
+ (hinfo->minor_hash < start_minor_hash)))
+ continue;
+ if (de->inode == 0)
+ continue;
+ err = ext4_htree_store_dirent(dir_file,
+ hinfo->hash, hinfo->minor_hash, de);
+ if (err) {
+ count = err;
+ goto out;
+ }
+ count++;
+ }
+ ret = count;
+out:
+ kfree(dir_buf);
+ brelse(iloc.bh);
+ return ret;
+}
+
+/*
+ * This function is called when the volume is mkfsed with
+ * dir_index disabled. In order to keep f_pos persistent
+ * after we convert from an inlined dir to a block-based one,
+ * we just pretend that we are a normal dir and return the
+ * offsets as if '.' and '..' really took up space.
+ */
+int ext4_read_inline_dir(struct file *file,
+ struct dir_context *ctx,
+ int *has_inline_data)
+{
+ unsigned int offset, parent_ino;
+ int i;
+ struct ext4_dir_entry_2 *de;
+ struct super_block *sb;
+ struct inode *inode = file_inode(file);
+ int ret, inline_size = 0;
+ struct ext4_iloc iloc;
+ void *dir_buf = NULL;
+ int dotdot_offset, dotdot_size, extra_offset, extra_size;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ up_read(&EXT4_I(inode)->xattr_sem);
+ *has_inline_data = 0;
+ goto out;
+ }
+
+ inline_size = ext4_get_inline_size(inode);
+ dir_buf = kmalloc(inline_size, GFP_NOFS);
+ if (!dir_buf) {
+ ret = -ENOMEM;
+ up_read(&EXT4_I(inode)->xattr_sem);
+ goto out;
+ }
+
+ ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
+ up_read(&EXT4_I(inode)->xattr_sem);
+ if (ret < 0)
+ goto out;
+
+ ret = 0;
+ sb = inode->i_sb;
+ parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+ offset = ctx->pos;
+
+ /*
+ * dotdot_offset and dotdot_size are the offset and size that
+ * ".." and "." would have if the dir were block-based, while
+ * their real size in the inline dir is only
+ * EXT4_INLINE_DOTDOT_SIZE. So we use extra_offset and
+ * extra_size to translate between the two during the inline
+ * dir iteration.
+ */
+ dotdot_offset = EXT4_DIR_REC_LEN(1);
+ dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
+ extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
+ extra_size = extra_offset + inline_size;
+
+ /*
+ * If the version has changed since the last call to
+ * readdir(2), then we might be pointing to an invalid
+ * dirent right now. Scan from the start of the inline
+ * dir to make sure.
+ */
+ if (file->f_version != inode->i_version) {
+ for (i = 0; i < extra_size && i < offset;) {
+ /*
+ * "." is at offset 0 and
+ * ".." is at dotdot_offset.
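+ *
+ * (Illustrative arithmetic, assuming the usual 12-byte
+ * EXT4_DIR_REC_LEN(1) and EXT4_DIR_REC_LEN(2): dotdot_offset
+ * is 12, dotdot_size is 24 and extra_offset is 20, so a file
+ * offset f >= dotdot_size maps to dir_buf[f - extra_offset].)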
+ */
+ if (!i) {
+ i = dotdot_offset;
+ continue;
+ } else if (i == dotdot_offset) {
+ i = dotdot_size;
+ continue;
+ }
+ /* For other entries, the real offset in
+ * the buf has to be adjusted accordingly.
+ */
+ de = (struct ext4_dir_entry_2 *)
+ (dir_buf + i - extra_offset);
+ /* It's too expensive to do a full
+ * dirent test each time round this
+ * loop, but we do have to test at
+ * least that it is non-zero. A
+ * failure will be detected in the
+ * dirent test below. */
+ if (ext4_rec_len_from_disk(de->rec_len, extra_size)
+ < EXT4_DIR_REC_LEN(1))
+ break;
+ i += ext4_rec_len_from_disk(de->rec_len,
+ extra_size);
+ }
+ offset = i;
+ ctx->pos = offset;
+ file->f_version = inode->i_version;
+ }
+
+ while (ctx->pos < extra_size) {
+ if (ctx->pos == 0) {
+ if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
+ goto out;
+ ctx->pos = dotdot_offset;
+ continue;
+ }
+
+ if (ctx->pos == dotdot_offset) {
+ if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR))
+ goto out;
+ ctx->pos = dotdot_size;
+ continue;
+ }
+
+ de = (struct ext4_dir_entry_2 *)
+ (dir_buf + ctx->pos - extra_offset);
+ if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf,
+ extra_size, ctx->pos))
+ goto out;
+ if (le32_to_cpu(de->inode)) {
+ if (!dir_emit(ctx, de->name, de->name_len,
+ le32_to_cpu(de->inode),
+ get_dtype(sb, de->file_type)))
+ goto out;
+ }
+ ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size);
+ }
+out:
+ kfree(dir_buf);
+ brelse(iloc.bh);
+ return ret;
+}
+
+struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
+ struct ext4_dir_entry_2 **parent_de,
+ int *retval)
+{
+ struct ext4_iloc iloc;
+
+ *retval = ext4_get_inode_loc(inode, &iloc);
+ if (*retval)
+ return NULL;
+
+ *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
+
+ return iloc.bh;
+}
+
+/*
+ * Try to create the inline data for the new dir.
+ * If it succeeds, return 0, otherwise return the error.
+ * In case of ENOSPC, the caller should create the normal disk layout dir.
+ */
+int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent,
+ struct inode *inode)
+{
+ int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE;
+ struct ext4_iloc iloc;
+ struct ext4_dir_entry_2 *de;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ ret = ext4_prepare_inline_data(handle, inode, inline_size);
+ if (ret)
+ goto out;
+
+ /*
+ * For an inline dir, we only store the inode number of ".."
+ * and create a fake dentry to cover the remaining space.
+ */ + de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; + de->inode = cpu_to_le32(parent->i_ino); + de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE); + de->inode = 0; + de->rec_len = ext4_rec_len_to_disk( + inline_size - EXT4_INLINE_DOTDOT_SIZE, + inline_size); + set_nlink(inode, 2); + inode->i_size = EXT4_I(inode)->i_disksize = inline_size; +out: + brelse(iloc.bh); + return ret; +} + +struct buffer_head *ext4_find_inline_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data) +{ + int ret; + struct ext4_iloc iloc; + void *inline_start; + int inline_size; + + if (ext4_get_inode_loc(dir, &iloc)) + return NULL; + + down_read(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + goto out; + } + + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + + EXT4_INLINE_DOTDOT_SIZE; + inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; + ret = search_dir(iloc.bh, inline_start, inline_size, + dir, d_name, 0, res_dir); + if (ret == 1) + goto out_find; + if (ret < 0) + goto out; + + if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE) + goto out; + + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); + inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; + + ret = search_dir(iloc.bh, inline_start, inline_size, + dir, d_name, 0, res_dir); + if (ret == 1) + goto out_find; + +out: + brelse(iloc.bh); + iloc.bh = NULL; +out_find: + up_read(&EXT4_I(dir)->xattr_sem); + return iloc.bh; +} + +int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data) +{ + int err, inline_size; + struct ext4_iloc iloc; + void *inline_start; + + err = ext4_get_inode_loc(dir, &iloc); + if (err) + return err; + + down_write(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + goto out; + } + + if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) < + EXT4_MIN_INLINE_DATA_SIZE) { + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + + EXT4_INLINE_DOTDOT_SIZE; + inline_size = EXT4_MIN_INLINE_DATA_SIZE - + EXT4_INLINE_DOTDOT_SIZE; + } else { + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); + inline_size = ext4_get_inline_size(dir) - + EXT4_MIN_INLINE_DATA_SIZE; + } + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto out; + + err = ext4_generic_delete_entry(handle, dir, de_del, bh, + inline_start, inline_size, 0); + if (err) + goto out; + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_mark_inode_dirty(handle, dir); + if (unlikely(err)) + goto out; + + ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size); +out: + up_write(&EXT4_I(dir)->xattr_sem); + brelse(iloc.bh); + if (err != -ENOENT) + ext4_std_error(dir->i_sb, err); + return err; +} + +/* + * Get the inline dentry at offset. 
+ */ +static inline struct ext4_dir_entry_2 * +ext4_get_inline_entry(struct inode *inode, + struct ext4_iloc *iloc, + unsigned int offset, + void **inline_start, + int *inline_size) +{ + void *inline_pos; + + BUG_ON(offset > ext4_get_inline_size(inode)); + + if (offset < EXT4_MIN_INLINE_DATA_SIZE) { + inline_pos = (void *)ext4_raw_inode(iloc)->i_block; + *inline_size = EXT4_MIN_INLINE_DATA_SIZE; + } else { + inline_pos = ext4_get_inline_xattr_pos(inode, iloc); + offset -= EXT4_MIN_INLINE_DATA_SIZE; + *inline_size = ext4_get_inline_size(inode) - + EXT4_MIN_INLINE_DATA_SIZE; + } + + if (inline_start) + *inline_start = inline_pos; + return (struct ext4_dir_entry_2 *)(inline_pos + offset); +} + +int empty_inline_dir(struct inode *dir, int *has_inline_data) +{ + int err, inline_size; + struct ext4_iloc iloc; + void *inline_pos; + unsigned int offset; + struct ext4_dir_entry_2 *de; + int ret = 1; + + err = ext4_get_inode_loc(dir, &iloc); + if (err) { + EXT4_ERROR_INODE(dir, "error %d getting inode %lu block", + err, dir->i_ino); + return 1; + } + + down_read(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + goto out; + } + + de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; + if (!le32_to_cpu(de->inode)) { + ext4_warning(dir->i_sb, + "bad inline directory (dir #%lu) - no `..'", + dir->i_ino); + ret = 1; + goto out; + } + + offset = EXT4_INLINE_DOTDOT_SIZE; + while (offset < dir->i_size) { + de = ext4_get_inline_entry(dir, &iloc, offset, + &inline_pos, &inline_size); + if (ext4_check_dir_entry(dir, NULL, de, + iloc.bh, inline_pos, + inline_size, offset)) { + ext4_warning(dir->i_sb, + "bad inline directory (dir #%lu) - " + "inode %u, rec_len %u, name_len %d" + "inline size %d\n", + dir->i_ino, le32_to_cpu(de->inode), + le16_to_cpu(de->rec_len), de->name_len, + inline_size); + ret = 1; + goto out; + } + if (le32_to_cpu(de->inode)) { + ret = 0; + goto out; + } + offset += ext4_rec_len_from_disk(de->rec_len, inline_size); + } + +out: + up_read(&EXT4_I(dir)->xattr_sem); + brelse(iloc.bh); + return ret; +} + +int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) +{ + int ret; + + down_write(&EXT4_I(inode)->xattr_sem); + ret = ext4_destroy_inline_data_nolock(handle, inode); + up_write(&EXT4_I(inode)->xattr_sem); + + return ret; +} + +int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline) +{ + __u64 physical = 0; + __u64 length; + __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST; + int error = 0; + struct ext4_iloc iloc; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + *has_inline = 0; + goto out; + } + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + goto out; + + physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; + physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; + physical += offsetof(struct ext4_inode, i_block); + length = i_size_read(inode); + + if (physical) + error = fiemap_fill_next_extent(fieinfo, 0, physical, + length, flags); + brelse(iloc.bh); +out: + up_read(&EXT4_I(inode)->xattr_sem); + return (error < 0 ? error : 0); +} + +/* + * Called during xattr set, and if we can sparse space 'needed', + * just create the extent tree evict the data to the outer block. + * + * We use jbd2 instead of page cache to move data to the 1st block + * so that the whole transaction can be committed as a whole and + * the data isn't lost because of the delayed page cache write. 
+ */ +int ext4_try_to_evict_inline_data(handle_t *handle, + struct inode *inode, + int needed) +{ + int error; + struct ext4_xattr_entry *entry; + struct ext4_inode *raw_inode; + struct ext4_iloc iloc; + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + + raw_inode = ext4_raw_inode(&iloc); + entry = (struct ext4_xattr_entry *)((void *)raw_inode + + EXT4_I(inode)->i_inline_off); + if (EXT4_XATTR_LEN(entry->e_name_len) + + EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) { + error = -ENOSPC; + goto out; + } + + error = ext4_convert_inline_data_nolock(handle, inode, &iloc); +out: + brelse(iloc.bh); + return error; +} + +void ext4_inline_data_truncate(struct inode *inode, int *has_inline) +{ + handle_t *handle; + int inline_size, value_len, needed_blocks; + size_t i_size; + void *value = NULL; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + + + needed_blocks = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks); + if (IS_ERR(handle)) + return; + + down_write(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + *has_inline = 0; + ext4_journal_stop(handle); + return; + } + + if (ext4_orphan_add(handle, inode)) + goto out; + + if (ext4_get_inode_loc(inode, &is.iloc)) + goto out; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = inode->i_size; + inline_size = ext4_get_inline_size(inode); + EXT4_I(inode)->i_disksize = i_size; + + if (i_size < inline_size) { + /* Clear the content in the xattr space. */ + if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { + if (ext4_xattr_ibody_find(inode, &i, &is)) + goto out_error; + + BUG_ON(is.s.not_found); + + value_len = le32_to_cpu(is.s.here->e_value_size); + value = kmalloc(value_len, GFP_NOFS); + if (!value) + goto out_error; + + if (ext4_xattr_ibody_get(inode, i.name_index, i.name, + value, value_len)) + goto out_error; + + i.value = value; + i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ? + i_size - EXT4_MIN_INLINE_DATA_SIZE : 0; + if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is)) + goto out_error; + } + + /* Clear the content within i_blocks. */ + if (i_size < EXT4_MIN_INLINE_DATA_SIZE) { + void *p = (void *) ext4_raw_inode(&is.iloc)->i_block; + memset(p + i_size, 0, + EXT4_MIN_INLINE_DATA_SIZE - i_size); + } + + EXT4_I(inode)->i_inline_size = i_size < + EXT4_MIN_INLINE_DATA_SIZE ? 
+ EXT4_MIN_INLINE_DATA_SIZE : i_size; + } + +out_error: + up_write(&EXT4_I(inode)->i_data_sem); +out: + brelse(is.iloc.bh); + up_write(&EXT4_I(inode)->xattr_sem); + kfree(value); + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + + ext4_journal_stop(handle); + return; +} + +int ext4_convert_inline_data(struct inode *inode) +{ + int error, needed_blocks; + handle_t *handle; + struct ext4_iloc iloc; + + if (!ext4_has_inline_data(inode)) { + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + return 0; + } + + needed_blocks = ext4_writepage_trans_blocks(inode); + + iloc.bh = NULL; + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto out_free; + } + + down_write(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_write(&EXT4_I(inode)->xattr_sem); + goto out; + } + + error = ext4_convert_inline_data_nolock(handle, inode, &iloc); + up_write(&EXT4_I(inode)->xattr_sem); +out: + ext4_journal_stop(handle); +out_free: + brelse(iloc.bh); + return error; +} diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 945cbf6cb1f..8a064734e6e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -12,20 +12,14 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * - * Goal-directed block allocation by Stephen Tweedie - * (sct@redhat.com), 1993, 1998 - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 * 64-bit file support on 64-bit platforms by Jakub Jelinek * (jj@sunsite.ms.mff.cuni.cz) * * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 */ -#include <linux/module.h> #include <linux/fs.h> #include <linux/time.h> -#include <linux/ext4_jbd2.h> #include <linux/jbd2.h> #include <linux/highuid.h> #include <linux/pagemap.h> @@ -33,134 +27,131 @@ #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/pagevec.h> #include <linux/mpage.h> +#include <linux/namei.h> #include <linux/uio.h> #include <linux/bio.h> +#include <linux/workqueue.h> +#include <linux/kernel.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/ratelimit.h> +#include <linux/aio.h> +#include <linux/bitops.h> + +#include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" +#include "truncate.h" -/* - * Test whether an inode is a fast symlink. - */ -static int ext4_inode_is_fast_symlink(struct inode *inode) -{ - int ea_blocks = EXT4_I(inode)->i_file_acl ? - (inode->i_sb->s_blocksize >> 9) : 0; +#include <trace/events/ext4.h> - return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); -} +#define MPAGE_DA_EXTENT_TAIL 0x01 -/* - * The ext4 forget function must perform a revoke if we are freeing data - * which has been journaled. Metadata (eg. indirect blocks) must be - * revoked in all cases. - * - * "bh" may be NULL: a metadata block may have been freed from memory - * but there may still be a record of it in the journal, and that record - * still needs to be revoked. 
- */ -int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, - struct buffer_head *bh, ext4_fsblk_t blocknr) +static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei) { - int err; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u16 csum_lo; + __u16 csum_hi = 0; + __u32 csum; + + csum_lo = le16_to_cpu(raw->i_checksum_lo); + raw->i_checksum_lo = 0; + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { + csum_hi = le16_to_cpu(raw->i_checksum_hi); + raw->i_checksum_hi = 0; + } - might_sleep(); + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, + EXT4_INODE_SIZE(inode->i_sb)); - BUFFER_TRACE(bh, "enter"); + raw->i_checksum_lo = cpu_to_le16(csum_lo); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) + raw->i_checksum_hi = cpu_to_le16(csum_hi); - jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " - "data mode %lx\n", - bh, is_metadata, inode->i_mode, - test_opt(inode->i_sb, DATA_FLAGS)); + return csum; +} - /* Never use the revoke function if we are doing full data - * journaling: there is no need to, and a V1 superblock won't - * support it. Otherwise, only skip the revoke on un-journaled - * data blocks. */ +static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei) +{ + __u32 provided, calculated; - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || - (!is_metadata && !ext4_should_journal_data(inode))) { - if (bh) { - BUFFER_TRACE(bh, "call jbd2_journal_forget"); - return ext4_journal_forget(handle, bh); - } - return 0; - } + if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != + cpu_to_le32(EXT4_OS_LINUX) || + !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + provided = le16_to_cpu(raw->i_checksum_lo); + calculated = ext4_inode_csum(inode, raw, ei); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) + provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16; + else + calculated &= 0xFFFF; - /* - * data!=journal && (is_metadata || should_journal_data(inode)) - */ - BUFFER_TRACE(bh, "call ext4_journal_revoke"); - err = ext4_journal_revoke(handle, blocknr, bh); - if (err) - ext4_abort(inode->i_sb, __FUNCTION__, - "error %d when attempting revoke", err); - BUFFER_TRACE(bh, "exit"); - return err; + return provided == calculated; } -/* - * Work out how many blocks we need to proceed with the next chunk of a - * truncate transaction. - */ -static unsigned long blocks_for_truncate(struct inode *inode) +static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei) { - ext4_lblk_t needed; - - needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); + __u32 csum; - /* Give ourselves just enough room to cope with inodes in which - * i_blocks is corrupt: we've seen disk corruptions in the past - * which resulted in random data in an inode which looked enough - * like a regular file for ext4 to try to delete it. Things - * will go a bit crazy if that happens, but at least we should - * try not to panic the whole kernel. */ - if (needed < 2) - needed = 2; - - /* But we need to bound the transaction so we don't overflow the - * journal. 
*/ - if (needed > EXT4_MAX_TRANS_DATA) - needed = EXT4_MAX_TRANS_DATA; + if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != + cpu_to_le32(EXT4_OS_LINUX) || + !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; - return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; + csum = ext4_inode_csum(inode, raw, ei); + raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) + raw->i_checksum_hi = cpu_to_le16(csum >> 16); } -/* - * Truncate transactions can be complex and absolutely huge. So we need to - * be able to restart the transaction at a conventient checkpoint to make - * sure we don't overflow the journal. - * - * start_transaction gets us a new handle for a truncate transaction, - * and extend_transaction tries to extend the existing one a bit. If - * extend fails, we need to propagate the failure up and restart the - * transaction in the top-level truncate loop. --sct - */ -static handle_t *start_transaction(struct inode *inode) +static inline int ext4_begin_ordered_truncate(struct inode *inode, + loff_t new_size) { - handle_t *result; - - result = ext4_journal_start(inode, blocks_for_truncate(inode)); - if (!IS_ERR(result)) - return result; - - ext4_std_error(inode->i_sb, PTR_ERR(result)); - return result; + trace_ext4_begin_ordered_truncate(inode, new_size); + /* + * If jinode is zero, then we never opened the file for + * writing, so there's no need to call + * jbd2_journal_begin_ordered_truncate() since there's no + * outstanding writes we need to flush. + */ + if (!EXT4_I(inode)->jinode) + return 0; + return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode, + new_size); } +static void ext4_invalidatepage(struct page *page, unsigned int offset, + unsigned int length); +static int __ext4_journalled_writepage(struct page *page, unsigned int len); +static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents); + /* - * Try to extend this transaction for the purposes of truncation. - * - * Returns 0 if we managed to create more room. If we can't create more - * room, and the transaction must be restarted we return 1. + * Test whether an inode is a fast symlink. */ -static int try_to_extend_transaction(handle_t *handle, struct inode *inode) +static int ext4_inode_is_fast_symlink(struct inode *inode) { - if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) - return 0; - if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) + int ea_blocks = EXT4_I(inode)->i_file_acl ? + EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; + + if (ext4_has_inline_data(inode)) return 0; - return 1; + + return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } /* @@ -168,40 +159,134 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) * so before we call here everything must be consistently dirtied against * this transaction. */ -static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) +int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, + int nblocks) { + int ret; + + /* + * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this + * moment, get_block can be called only for blocks inside i_size since + * page cache has been already dropped and writes are blocked by + * i_mutex. So we can safely drop the i_data_sem here. 
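/*
 * Editor's sketch (not part of the patch): the (removed) blocks_for_truncate()
 * above estimates journal credits from i_blocks. The same arithmetic as a
 * standalone model: 512-byte sectors are scaled to filesystem blocks, then
 * clamped so a corrupt i_blocks can neither starve the transaction nor
 * overflow the journal. The two bound constants below are illustrative.
 */
#include <stdint.h>

#define MODEL_MIN_NEEDED	2UL
#define MODEL_MAX_TRANS_DATA	64UL	/* stand-in for EXT4_MAX_TRANS_DATA */

static unsigned long model_blocks_for_truncate(uint64_t i_blocks,
					       unsigned int blocksize_bits,
					       unsigned long data_trans_blocks)
{
	unsigned long needed = i_blocks >> (blocksize_bits - 9);

	if (needed < MODEL_MIN_NEEDED)		/* guard against corrupt i_blocks */
		needed = MODEL_MIN_NEEDED;
	if (needed > MODEL_MAX_TRANS_DATA)	/* bound the transaction size */
		needed = MODEL_MAX_TRANS_DATA;

	return data_trans_blocks + needed;
}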
+ */ + BUG_ON(EXT4_JOURNAL(inode) == NULL); jbd_debug(2, "restarting handle %p\n", handle); - return ext4_journal_restart(handle, blocks_for_truncate(inode)); + up_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_journal_restart(handle, nblocks); + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + return ret; } /* * Called at the last iput() if i_nlink is zero. */ -void ext4_delete_inode (struct inode * inode) +void ext4_evict_inode(struct inode *inode) { handle_t *handle; + int err; - truncate_inode_pages(&inode->i_data, 0); + trace_ext4_evict_inode(inode); + + if (inode->i_nlink) { + /* + * When journalling data dirty buffers are tracked only in the + * journal. So although mm thinks everything is clean and + * ready for reaping the inode might still have some pages to + * write in the running transaction or waiting to be + * checkpointed. Thus calling jbd2_journal_invalidatepage() + * (via truncate_inode_pages()) to discard these buffers can + * cause data loss. Also even if we did not discard these + * buffers, we would have no way to find them after the inode + * is reaped and thus user could see stale data if he tries to + * read them before the transaction is checkpointed. So be + * careful and force everything to disk here... We use + * ei->i_datasync_tid to store the newest transaction + * containing inode's data. + * + * Note that directories do not have this problem because they + * don't use page cache. + */ + if (ext4_should_journal_data(inode) && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && + inode->i_ino != EXT4_JOURNAL_INO) { + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; + + jbd2_complete_transaction(journal, commit_tid); + filemap_write_and_wait(&inode->i_data); + } + truncate_inode_pages_final(&inode->i_data); + + WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); + goto no_delete; + } + + if (!is_bad_inode(inode)) + dquot_initialize(inode); + + if (ext4_should_order_data(inode)) + ext4_begin_ordered_truncate(inode, 0); + truncate_inode_pages_final(&inode->i_data); + WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); if (is_bad_inode(inode)) goto no_delete; - handle = start_transaction(inode); + /* + * Protect us against freezing - iput() caller didn't have to have any + * protection against it + */ + sb_start_intwrite(inode->i_sb); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, + ext4_blocks_for_truncate(inode)+3); if (IS_ERR(handle)) { + ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* * If we're going to skip the normal cleanup, we still need to * make sure that the in-core orphan linked list is properly * cleaned up. */ ext4_orphan_del(NULL, inode); + sb_end_intwrite(inode->i_sb); goto no_delete; } if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode->i_size = 0; + err = ext4_mark_inode_dirty(handle, inode); + if (err) { + ext4_warning(inode->i_sb, + "couldn't mark inode dirty (err %d)", err); + goto stop_handle; + } if (inode->i_blocks) ext4_truncate(inode); + + /* + * ext4_ext_truncate() doesn't reserve any slop when it + * restarts journal transactions; therefore there may not be + * enough credits left in the handle to remove the inode from + * the orphan list and set the dtime field. 
+ */ + if (!ext4_handle_has_enough_credits(handle, 3)) { + err = ext4_journal_extend(handle, 3); + if (err > 0) + err = ext4_journal_restart(handle, 3); + if (err != 0) { + ext4_warning(inode->i_sb, + "couldn't extend journal (err %d)", err); + stop_handle: + ext4_journal_stop(handle); + ext4_orphan_del(NULL, inode); + sb_end_intwrite(inode->i_sb); + goto no_delete; + } + } + /* * Kill off the orphan record which ext4_truncate created. * AKPM: I think this can be inside the above `if'. @@ -222,887 +307,543 @@ void ext4_delete_inode (struct inode * inode) */ if (ext4_mark_inode_dirty(handle, inode)) /* If that failed, just do the required in-core inode clear. */ - clear_inode(inode); + ext4_clear_inode(inode); else ext4_free_inode(handle, inode); ext4_journal_stop(handle); + sb_end_intwrite(inode->i_sb); return; no_delete: - clear_inode(inode); /* We must guarantee clearing of inode... */ + ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } -typedef struct { - __le32 *p; - __le32 key; - struct buffer_head *bh; -} Indirect; - -static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) +#ifdef CONFIG_QUOTA +qsize_t *ext4_get_reserved_space(struct inode *inode) { - p->key = *(p->p = v); - p->bh = bh; + return &EXT4_I(inode)->i_reserved_quota; } - -/** - * ext4_block_to_path - parse the block number into array of offsets - * @inode: inode in question (we are only interested in its superblock) - * @i_block: block number to be parsed - * @offsets: array to store the offsets in - * @boundary: set this non-zero if the referred-to block is likely to be - * followed (on disk) by an indirect block. - * - * To store the locations of file's data ext4 uses a data structure common - * for UNIX filesystems - tree of pointers anchored in the inode, with - * data blocks at leaves and indirect blocks in intermediate nodes. - * This function translates the block number into path in that tree - - * return value is the path length and @offsets[n] is the offset of - * pointer to (n+1)th node in the nth one. If @block is out of range - * (negative or too large) warning is printed and zero returned. - * - * Note: function doesn't find node addresses, so no IO is needed. All - * we need to know is the capacity of indirect blocks (taken from the - * inode->i_sb). - */ +#endif /* - * Portability note: the last comparison (check that we fit into triple - * indirect block) is spelled differently, because otherwise on an - * architecture with 32-bit longs and 8Kb pages we might get into trouble - * if our filesystem had 8Kb blocks. We might use long long, but that would - * kill us on x86. Oh, well, at least the sign propagation does not matter - - * i_block would have to be negative in the very beginning, so we would not - * get there at all. 
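/*
 * Editor's sketch (not part of the patch): the credit top-up at the start of
 * this hunk ("extend the handle, else restart it") is a recurring jbd2 idiom.
 * A minimal model of that decision, assuming the jbd2 convention that extend
 * returns 0 on success and > 0 when the journal cannot grow the handle, with
 * restart as the fallback; both helpers below are fake stand-ins.
 */
static int model_journal_extend(int credits_left, int wanted)
{
	return credits_left >= wanted ? 0 : 1;	/* 1: cannot extend, restart */
}

static int model_journal_restart(int wanted)
{
	(void)wanted;
	return 0;				/* pretend the restart worked */
}

static int model_ensure_credits(int credits_left, int needed)
{
	int err = 0;

	if (credits_left < needed) {
		err = model_journal_extend(credits_left, needed);
		if (err > 0)
			err = model_journal_restart(needed);
	}
	return err;	/* non-zero: caller must clean up and bail out */
}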
+ * Calculate the number of metadata blocks need to reserve + * to allocate a block located at @lblock */ - -static int ext4_block_to_path(struct inode *inode, - ext4_lblk_t i_block, - ext4_lblk_t offsets[4], int *boundary) -{ - int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); - int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); - const long direct_blocks = EXT4_NDIR_BLOCKS, - indirect_blocks = ptrs, - double_blocks = (1 << (ptrs_bits * 2)); - int n = 0; - int final = 0; - - if (i_block < 0) { - ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0"); - } else if (i_block < direct_blocks) { - offsets[n++] = i_block; - final = direct_blocks; - } else if ( (i_block -= direct_blocks) < indirect_blocks) { - offsets[n++] = EXT4_IND_BLOCK; - offsets[n++] = i_block; - final = ptrs; - } else if ((i_block -= indirect_blocks) < double_blocks) { - offsets[n++] = EXT4_DIND_BLOCK; - offsets[n++] = i_block >> ptrs_bits; - offsets[n++] = i_block & (ptrs - 1); - final = ptrs; - } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { - offsets[n++] = EXT4_TIND_BLOCK; - offsets[n++] = i_block >> (ptrs_bits * 2); - offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); - offsets[n++] = i_block & (ptrs - 1); - final = ptrs; - } else { - ext4_warning(inode->i_sb, "ext4_block_to_path", - "block %lu > max", - i_block + direct_blocks + - indirect_blocks + double_blocks); - } - if (boundary) - *boundary = final - 1 - (i_block & (ptrs - 1)); - return n; -} - -/** - * ext4_get_branch - read the chain of indirect blocks leading to data - * @inode: inode in question - * @depth: depth of the chain (1 - direct pointer, etc.) - * @offsets: offsets of pointers in inode/indirect blocks - * @chain: place to store the result - * @err: here we store the error value - * - * Function fills the array of triples <key, p, bh> and returns %NULL - * if everything went OK or the pointer to the last filled triple - * (incomplete one) otherwise. Upon the return chain[i].key contains - * the number of (i+1)-th block in the chain (as it is stored in memory, - * i.e. little-endian 32-bit), chain[i].p contains the address of that - * number (it points into struct inode for i==0 and into the bh->b_data - * for i>0) and chain[i].bh points to the buffer_head of i-th indirect - * block for i>0 and NULL for i==0. In other words, it holds the block - * numbers of the chain, addresses they were taken from (and where we can - * verify that chain did not change) and buffer_heads hosting these - * numbers. - * - * Function stops when it stumbles upon zero pointer (absent block) - * (pointer to last triple returned, *@err == 0) - * or when it gets an IO error reading an indirect block - * (ditto, *@err == -EIO) - * or when it reads all @depth-1 indirect blocks successfully and finds - * the whole chain, all way to the data (returns %NULL, *err == 0). 
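/*
 * Editor's sketch (not part of the patch): a compilable userspace version of
 * the (removed) ext4_block_to_path() above. For a 4KiB block size there are
 * 1024 block pointers per indirect block (ptrs_bits == 10); offsets[n] ends
 * up holding the index of the pointer to the (n + 1)-th node inside the n-th,
 * exactly as the comment above describes. The boundary hint is omitted here.
 */
#include <stdio.h>

#define MODEL_NDIR_BLOCKS	12	/* direct slots, as in ext4 */
#define MODEL_IND_BLOCK		12
#define MODEL_DIND_BLOCK	13
#define MODEL_TIND_BLOCK	14

static int model_block_to_path(unsigned long i_block,
			       unsigned long offsets[4], int ptrs_bits)
{
	const unsigned long ptrs = 1UL << ptrs_bits;
	const unsigned long dbl = 1UL << (ptrs_bits * 2);
	int n = 0;

	if (i_block < MODEL_NDIR_BLOCKS) {
		offsets[n++] = i_block;
	} else if ((i_block -= MODEL_NDIR_BLOCKS) < ptrs) {
		offsets[n++] = MODEL_IND_BLOCK;
		offsets[n++] = i_block;
	} else if ((i_block -= ptrs) < dbl) {
		offsets[n++] = MODEL_DIND_BLOCK;
		offsets[n++] = i_block >> ptrs_bits;
		offsets[n++] = i_block & (ptrs - 1);
	} else if (((i_block -= dbl) >> (ptrs_bits * 2)) < ptrs) {
		offsets[n++] = MODEL_TIND_BLOCK;
		offsets[n++] = i_block >> (ptrs_bits * 2);
		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
		offsets[n++] = i_block & (ptrs - 1);
	}
	return n;	/* 0 means the block number was out of range */
}

int main(void)
{
	unsigned long off[4];
	int i, depth = model_block_to_path(100000, off, 10);

	printf("depth %d:", depth);
	for (i = 0; i < depth; i++)
		printf(" %lu", off[i]);
	printf("\n");
	return 0;
}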
- * - * Need to be called with - * down_read(&EXT4_I(inode)->i_data_sem) - */ -static Indirect *ext4_get_branch(struct inode *inode, int depth, - ext4_lblk_t *offsets, - Indirect chain[4], int *err) +static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) { - struct super_block *sb = inode->i_sb; - Indirect *p = chain; - struct buffer_head *bh; + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return ext4_ext_calc_metadata_amount(inode, lblock); - *err = 0; - /* i_data is not going away, no lock needed */ - add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets); - if (!p->key) - goto no_block; - while (--depth) { - bh = sb_bread(sb, le32_to_cpu(p->key)); - if (!bh) - goto failure; - add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); - /* Reader: end */ - if (!p->key) - goto no_block; - } - return NULL; - -failure: - *err = -EIO; -no_block: - return p; + return ext4_ind_calc_metadata_amount(inode, lblock); } -/** - * ext4_find_near - find a place for allocation with sufficient locality - * @inode: owner - * @ind: descriptor of indirect block. - * - * This function returns the prefered place for block allocation. - * It is used when heuristic for sequential allocation fails. - * Rules are: - * + if there is a block to the left of our position - allocate near it. - * + if pointer will live in indirect block - allocate near that block. - * + if pointer will live in inode - allocate in the same - * cylinder group. - * - * In the latter case we colour the starting block by the callers PID to - * prevent it from clashing with concurrent allocations for a different inode - * in the same block group. The PID is used here so that functionally related - * files will be close-by on-disk. - * - * Caller must make sure that @ind is valid and will stay that way. +/* + * Called with i_data_sem down, which is important since we can call + * ext4_discard_preallocations() from here. */ -static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) +void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); - __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; - __le32 *p; - ext4_fsblk_t bg_start; - ext4_fsblk_t last_block; - ext4_grpblk_t colour; - /* Try to find previous block */ - for (p = ind->p - 1; p >= start; p--) { - if (*p) - return le32_to_cpu(*p); + spin_lock(&ei->i_block_reservation_lock); + trace_ext4_da_update_reserve_space(inode, used, quota_claim); + if (unlikely(used > ei->i_reserved_data_blocks)) { + ext4_warning(inode->i_sb, "%s: ino %lu, used %d " + "with only %d reserved data blocks", + __func__, inode->i_ino, used, + ei->i_reserved_data_blocks); + WARN_ON(1); + used = ei->i_reserved_data_blocks; } - /* No such thing, so let's try location of indirect block */ - if (ind->bh) - return ind->bh->b_blocknr; - - /* - * It is going to be referred to from the inode itself? OK, just put it - * into the same cylinder group then. - */ - bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group); - last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; - - if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) - colour = (current->pid % 16) * - (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); - else - colour = (current->pid % 16) * ((last_block - bg_start) / 16); - return bg_start + colour; -} - -/** - * ext4_find_goal - find a prefered place for allocation. 
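/*
 * Editor's sketch (not part of the patch): the PID "colouring" inside the
 * (removed) ext4_find_near() above spreads unrelated processes across a
 * block group in 16 evenly sized bands, so allocations from the same process
 * land close together. Just that arithmetic, self-contained:
 */
#include <sys/types.h>

static unsigned long model_alloc_colour(pid_t pid, unsigned long bg_start,
					unsigned long blocks_per_group,
					unsigned long last_block)
{
	unsigned long colour;

	if (bg_start + blocks_per_group <= last_block)
		colour = (pid % 16) * (blocks_per_group / 16);
	else	/* truncated last group: scale the bands down */
		colour = (pid % 16) * ((last_block - bg_start) / 16);

	return bg_start + colour;
}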
- * @inode: owner - * @block: block we want - * @partial: pointer to the last triple within a chain - * - * Normally this function find the prefered place for block allocation, - * returns it. - */ -static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, - Indirect *partial) -{ - struct ext4_block_alloc_info *block_i; - - block_i = EXT4_I(inode)->i_block_alloc_info; - - /* - * try the heuristic for sequential allocation, - * failing that at least try to get decent locality. - */ - if (block_i && (block == block_i->last_alloc_logical_block + 1) - && (block_i->last_alloc_physical_block != 0)) { - return block_i->last_alloc_physical_block + 1; + if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) { + ext4_warning(inode->i_sb, "ino %lu, allocated %d " + "with only %d reserved metadata blocks " + "(releasing %d blocks with reserved %d data blocks)", + inode->i_ino, ei->i_allocated_meta_blocks, + ei->i_reserved_meta_blocks, used, + ei->i_reserved_data_blocks); + WARN_ON(1); + ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks; } - return ext4_find_near(inode, partial); -} - -/** - * ext4_blks_to_allocate: Look up the block map and count the number - * of direct blocks need to be allocated for the given branch. - * - * @branch: chain of indirect blocks - * @k: number of blocks need for indirect blocks - * @blks: number of data blocks to be mapped. - * @blocks_to_boundary: the offset in the indirect block - * - * return the total number of blocks to be allocate, including the - * direct and indirect blocks. - */ -static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks, - int blocks_to_boundary) -{ - unsigned long count = 0; + /* Update per-inode reservations */ + ei->i_reserved_data_blocks -= used; + ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + used + ei->i_allocated_meta_blocks); + ei->i_allocated_meta_blocks = 0; - /* - * Simple case, [t,d]Indirect block(s) has not allocated yet - * then it's clear blocks on that path have not allocated - */ - if (k > 0) { - /* right now we don't handle cross boundary allocation */ - if (blks < blocks_to_boundary + 1) - count += blks; - else - count += blocks_to_boundary + 1; - return count; + if (ei->i_reserved_data_blocks == 0) { + /* + * We can release all of the reserved metadata blocks + * only when we have written all of the delayed + * allocation blocks. + */ + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + ei->i_reserved_meta_blocks); + ei->i_reserved_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; } + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - count++; - while (count < blks && count <= blocks_to_boundary && - le32_to_cpu(*(branch[0].p + count)) == 0) { - count++; + /* Update quota subsystem for data blocks */ + if (quota_claim) + dquot_claim_block(inode, EXT4_C2B(sbi, used)); + else { + /* + * We did fallocate with an offset that is already delayed + * allocated. So on delayed allocated writeback we should + * not re-claim the quota for fallocated blocks. 
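/*
 * Editor's sketch (not part of the patch): the counting rule of the (removed)
 * ext4_blks_to_allocate() above, as a standalone function. When indirect
 * blocks are still missing (k > 0) the whole branch is holes and only the
 * boundary caps the request; otherwise contiguous zero pointers after the
 * first direct block are counted. "slots" stands in for branch[0].p.
 */
static unsigned long model_blks_to_allocate(const unsigned int *slots, int k,
					    unsigned long blks,
					    unsigned long blocks_to_boundary)
{
	unsigned long count = 0;

	if (k > 0)	/* unallocated indirect chain: no cross-boundary runs */
		return blks < blocks_to_boundary + 1 ?
			blks : blocks_to_boundary + 1;

	count++;	/* the first direct block is always part of the run */
	while (count < blks && count <= blocks_to_boundary &&
	       slots[count] == 0)
		count++;
	return count;
}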
+ */ + dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); } - return count; -} - -/** - * ext4_alloc_blocks: multiple allocate blocks needed for a branch - * @indirect_blks: the number of blocks need to allocate for indirect - * blocks - * - * @new_blocks: on return it will store the new block numbers for - * the indirect blocks(if needed) and the first direct block, - * @blks: on return it will store the total number of allocated - * direct blocks - */ -static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, int indirect_blks, int blks, - ext4_fsblk_t new_blocks[4], int *err) -{ - int target, i; - unsigned long count = 0; - int index = 0; - ext4_fsblk_t current_block = 0; - int ret = 0; /* - * Here we try to allocate the requested multiple blocks at once, - * on a best-effort basis. - * To build a branch, we should allocate blocks for - * the indirect blocks(if not allocated yet), and at least - * the first direct block of this branch. That's the - * minimum number of blocks need to allocate(required) + * If we have done all the pending block allocations and if + * there aren't any writers on the inode, we can discard the + * inode's preallocations. */ - target = blks + indirect_blks; - - while (1) { - count = target; - /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext4_new_blocks(handle,inode,goal,&count,err); - if (*err) - goto failed_out; - - target -= count; - /* allocate blocks for indirect blocks */ - while (index < indirect_blks && count) { - new_blocks[index++] = current_block++; - count--; - } - - if (count > 0) - break; - } - - /* save the new block number for the first direct block */ - new_blocks[index] = current_block; - - /* total number of blocks allocated for direct blocks */ - ret = count; - *err = 0; - return ret; -failed_out: - for (i = 0; i <index; i++) - ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); - return ret; + if ((ei->i_reserved_data_blocks == 0) && + (atomic_read(&inode->i_writecount) == 0)) + ext4_discard_preallocations(inode); } -/** - * ext4_alloc_branch - allocate and set up a chain of blocks. - * @inode: owner - * @indirect_blks: number of allocated indirect blocks - * @blks: number of allocated direct blocks - * @offsets: offsets (in the blocks) to store the pointers to next. - * @branch: place to store the chain in. - * - * This function allocates blocks, zeroes out all but the last one, - * links them into chain and (if we are synchronous) writes them to disk. - * In other words, it prepares a branch that can be spliced onto the - * inode. It stores the information about that chain in the branch[], in - * the same format as ext4_get_branch() would do. We are calling it after - * we had read the existing part of chain and partial points to the last - * triple of that (one with zero ->key). Upon the exit we have the same - * picture as after the successful ext4_get_block(), except that in one - * place chain is disconnected - *branch->p is still zero (we did not - * set the last link), but branch->key contains the number that should - * be placed into *branch->p to fill that gap. - * - * If allocation fails we free all blocks we've allocated (and forget - * their buffer_heads) and return the error value the from failed - * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain - * as described above and return 0. 
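/*
 * Editor's sketch (not part of the patch): the retry loop of the (removed)
 * ext4_alloc_blocks() above, reduced to its shape. Whatever the allocator
 * returns first feeds the missing indirect blocks (at most 3, so new_blocks[4]
 * suffices); the loop only stops once at least one data block is in hand.
 * model_new_blocks() is a fake allocator, and the real code's error-path
 * freeing of partial allocations is elided.
 */
#include <stdint.h>

static uint64_t model_new_blocks(uint64_t goal, unsigned long *count, int *err)
{
	*err = 0;
	if (*count > 2)
		*count = 2;	/* pretend only short runs are free */
	return goal;
}

static int model_alloc_blocks(uint64_t goal, int indirect_blks,
			      unsigned long blks, uint64_t new_blocks[4],
			      int *err)
{
	unsigned long target = blks + indirect_blks, count;
	uint64_t current_block;
	int index = 0;

	while (1) {
		count = target;
		current_block = model_new_blocks(goal, &count, err);
		if (*err)
			return 0;
		target -= count;
		while (index < indirect_blks && count) {	/* metadata first */
			new_blocks[index++] = current_block++;
			count--;
		}
		if (count > 0)		/* got at least one data block */
			break;
	}
	new_blocks[index] = current_block;	/* first direct block */
	return (int)count;			/* direct blocks obtained */
}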
- */ -static int ext4_alloc_branch(handle_t *handle, struct inode *inode, - int indirect_blks, int *blks, ext4_fsblk_t goal, - ext4_lblk_t *offsets, Indirect *branch) +static int __check_block_validity(struct inode *inode, const char *func, + unsigned int line, + struct ext4_map_blocks *map) { - int blocksize = inode->i_sb->s_blocksize; - int i, n = 0; - int err = 0; - struct buffer_head *bh; - int num; - ext4_fsblk_t new_blocks[4]; - ext4_fsblk_t current_block; - - num = ext4_alloc_blocks(handle, inode, goal, indirect_blks, - *blks, new_blocks, &err); - if (err) - return err; - - branch[0].key = cpu_to_le32(new_blocks[0]); - /* - * metadata blocks and data blocks are allocated. - */ - for (n = 1; n <= indirect_blks; n++) { - /* - * Get buffer_head for parent block, zero it out - * and set the pointer to new one, then send - * parent to disk. - */ - bh = sb_getblk(inode->i_sb, new_blocks[n-1]); - branch[n].bh = bh; - lock_buffer(bh); - BUFFER_TRACE(bh, "call get_create_access"); - err = ext4_journal_get_create_access(handle, bh); - if (err) { - unlock_buffer(bh); - brelse(bh); - goto failed; - } - - memset(bh->b_data, 0, blocksize); - branch[n].p = (__le32 *) bh->b_data + offsets[n]; - branch[n].key = cpu_to_le32(new_blocks[n]); - *branch[n].p = branch[n].key; - if ( n == indirect_blks) { - current_block = new_blocks[n]; - /* - * End of chain, update the last new metablock of - * the chain to point to the new allocated - * data blocks numbers - */ - for (i=1; i < num; i++) - *(branch[n].p + i) = cpu_to_le32(++current_block); - } - BUFFER_TRACE(bh, "marking uptodate"); - set_buffer_uptodate(bh); - unlock_buffer(bh); - - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh); - if (err) - goto failed; - } - *blks = num; - return err; -failed: - /* Allocation failed, free what we already allocated */ - for (i = 1; i <= n ; i++) { - BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); - ext4_journal_forget(handle, branch[i].bh); + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, + map->m_len)) { + ext4_error_inode(inode, func, line, map->m_pblk, + "lblock %lu mapped to illegal pblock " + "(length %d)", (unsigned long) map->m_lblk, + map->m_len); + return -EIO; } - for (i = 0; i <indirect_blks; i++) - ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); - - ext4_free_blocks(handle, inode, new_blocks[i], num, 0); - - return err; + return 0; } -/** - * ext4_splice_branch - splice the allocated branch onto inode. - * @inode: owner - * @block: (logical) number of block we are adding - * @chain: chain of indirect blocks (with a missing link - see - * ext4_alloc_branch) - * @where: location of missing link - * @num: number of indirect blocks we are adding - * @blks: number of direct blocks we are adding - * - * This function fills the missing link and does all housekeeping needed in - * inode (->i_blocks, etc.). In case of success we end up with the full - * chain to new block and return 0. - */ -static int ext4_splice_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t block, Indirect *where, int num, int blks) -{ - int i; - int err = 0; - struct ext4_block_alloc_info *block_i; - ext4_fsblk_t current_block; +#define check_block_validity(inode, map) \ + __check_block_validity((inode), __func__, __LINE__, (map)) - block_i = EXT4_I(inode)->i_block_alloc_info; - /* - * If we're splicing into a [td]indirect block (as opposed to the - * inode) then we need to get write access to the [td]indirect block - * before the splice. 
- */ - if (where->bh) { - BUFFER_TRACE(where->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, where->bh); - if (err) - goto err_out; - } - /* That's it */ - - *where->p = where->key; - - /* - * Update the host buffer_head or inode to point to more just allocated - * direct blocks blocks - */ - if (num == 0 && blks > 1) { - current_block = le32_to_cpu(where->key) + 1; - for (i = 1; i < blks; i++) - *(where->p + i ) = cpu_to_le32(current_block++); - } +#ifdef ES_AGGRESSIVE_TEST +static void ext4_map_blocks_es_recheck(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *es_map, + struct ext4_map_blocks *map, + int flags) +{ + int retval; + map->m_flags = 0; /* - * update the most recently allocated logical & physical block - * in i_block_alloc_info, to assist find the proper goal block for next - * allocation + * There is a race window that the result is not the same. + * e.g. xfstests #223 when dioread_nolock enables. The reason + * is that we lookup a block mapping in extent status tree with + * out taking i_data_sem. So at the time the unwritten extent + * could be converted. */ - if (block_i) { - block_i->last_alloc_logical_block = block + blks - 1; - block_i->last_alloc_physical_block = - le32_to_cpu(where[num].key) + blks - 1; - } - - /* We are done with atomic stuff, now do the rest of housekeeping */ - - inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - - /* had we spliced it onto indirect block? */ - if (where->bh) { - /* - * If we spliced it onto an indirect block, we haven't - * altered the inode. Note however that if it is being spliced - * onto an indirect block at the very end of the file (the - * file is growing) then we *will* alter the inode to reflect - * the new i_size. But that is not done here - it is done in - * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. - */ - jbd_debug(5, "splicing indirect only\n"); - BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, where->bh); - if (err) - goto err_out; + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + down_read(&EXT4_I(inode)->i_data_sem); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); } else { - /* - * OK, we spliced it into the inode itself on a direct block. - * Inode was dirtied above. - */ - jbd_debug(5, "splicing direct\n"); + retval = ext4_ind_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); } - return err; - -err_out: - for (i = 1; i <= num; i++) { - BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); - ext4_journal_forget(handle, where[i].bh); - ext4_free_blocks(handle, inode, - le32_to_cpu(where[i-1].key), 1, 0); - } - ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0); - - return err; -} - -/* - * Allocation strategy is simple: if we have to allocate something, we will - * have to go the whole way to leaf. So let's do it before attaching anything - * to tree, set linkage between the newborn blocks, write them if sync is - * required, recheck the path, free and repeat if check fails, otherwise - * set the last missing link (that will protect us from any truncate-generated - * removals - all blocks on the path are immune now) and possibly force the - * write on the parent block. 
- * That has a nice additional property: no special recovery from the failed - * allocations is needed - we simply release blocks and do not touch anything - * reachable from inode. - * - * `handle' can be NULL if create == 0. - * - * return > 0, # of blocks mapped or allocated. - * return = 0, if plain lookup failed. - * return < 0, error case. - * - * - * Need to be called with - * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block - * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) - */ -int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, unsigned long maxblocks, - struct buffer_head *bh_result, - int create, int extend_disksize) -{ - int err = -EIO; - ext4_lblk_t offsets[4]; - Indirect chain[4]; - Indirect *partial; - ext4_fsblk_t goal; - int indirect_blks; - int blocks_to_boundary = 0; - int depth; - struct ext4_inode_info *ei = EXT4_I(inode); - int count = 0; - ext4_fsblk_t first_block = 0; - - - J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); - J_ASSERT(handle != NULL || create == 0); - depth = ext4_block_to_path(inode, iblock, offsets, - &blocks_to_boundary); - - if (depth == 0) - goto out; - - partial = ext4_get_branch(inode, depth, offsets, chain, &err); - - /* Simplest case - block found, no allocation needed */ - if (!partial) { - first_block = le32_to_cpu(chain[depth - 1].key); - clear_buffer_new(bh_result); - count++; - /*map more blocks*/ - while (count < maxblocks && count <= blocks_to_boundary) { - ext4_fsblk_t blk; - - blk = le32_to_cpu(*(chain[depth-1].p + count)); - - if (blk == first_block + count) - count++; - else - break; - } - goto got_it; - } - - /* Next simple case - plain lookup or failed read of indirect block */ - if (!create || err == -EIO) - goto cleanup; - + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + up_read((&EXT4_I(inode)->i_data_sem)); /* - * Okay, we need to do block allocation. Lazily initialize the block - * allocation info here if necessary - */ - if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) - ext4_init_block_alloc_info(inode); - - goal = ext4_find_goal(inode, iblock, partial); - - /* the number of blocks need to allocate for [d,t]indirect blocks */ - indirect_blks = (chain + depth) - partial - 1; - - /* - * Next look up the indirect map to count the totoal number of - * direct blocks to allocate for this branch. + * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag + * because it shouldn't be marked in es_map->m_flags. */ - count = ext4_blks_to_allocate(partial, indirect_blks, - maxblocks, blocks_to_boundary); - /* - * Block out ext4_truncate while we alter the tree - */ - err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal, - offsets + (partial - chain), partial); + map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY); /* - * The ext4_splice_branch call will free and forget any buffers - * on the new chain if there is a failure, but that risks using - * up transaction credits, especially for bitmaps where the - * credits cannot be returned. Can we handle this somehow? We - * may need to return -EAGAIN upwards in the worst case. --sct + * We don't check m_len because extent will be collpased in status + * tree. So the m_len might not equal. */ - if (!err) - err = ext4_splice_branch(handle, inode, iblock, - partial, indirect_blks, count); - /* - * i_disksize growing is protected by i_data_sem. 
Don't forget to - * protect it if you're about to implement concurrent - * ext4_get_block() -bzzz - */ - if (!err && extend_disksize && inode->i_size > ei->i_disksize) - ei->i_disksize = inode->i_size; - if (err) - goto cleanup; - - set_buffer_new(bh_result); -got_it: - map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); - if (count > blocks_to_boundary) - set_buffer_boundary(bh_result); - err = count; - /* Clean up and exit */ - partial = chain + depth - 1; /* the whole chain */ -cleanup: - while (partial > chain) { - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - partial--; - } - BUFFER_TRACE(bh_result, "returned"); -out: - return err; + if (es_map->m_lblk != map->m_lblk || + es_map->m_flags != map->m_flags || + es_map->m_pblk != map->m_pblk) { + printk("ES cache assertion failed for inode: %lu " + "es_cached ex [%d/%d/%llu/%x] != " + "found ex [%d/%d/%llu/%x] retval %d flags %x\n", + inode->i_ino, es_map->m_lblk, es_map->m_len, + es_map->m_pblk, es_map->m_flags, map->m_lblk, + map->m_len, map->m_pblk, map->m_flags, + retval, flags); + } } +#endif /* ES_AGGRESSIVE_TEST */ -/* Maximum number of blocks we map for direct IO at once. */ -#define DIO_MAX_BLOCKS 4096 /* - * Number of credits we need for writing DIO_MAX_BLOCKS: - * We need sb + group descriptor + bitmap + inode -> 4 - * For B blocks with A block pointers per block we need: - * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect). - * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25. - */ -#define DIO_CREDITS 25 - - -/* - * + * The ext4_map_blocks() function tries to look up the requested blocks, + * and returns if the blocks are already mapped. * - * ext4_ext4 get_block() wrapper function - * It will do a look up first, and returns if the blocks already mapped. * Otherwise it takes the write lock of the i_data_sem and allocate blocks * and store the allocated blocks in the result buffer head and mark it * mapped. * - * If file type is extents based, it will call ext4_ext_get_blocks(), - * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping + * If file type is extents based, it will call ext4_ext_map_blocks(), + * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * - * On success, it returns the number of blocks being mapped or allocate. - * if create==0 and the blocks are pre-allocated and uninitialized block, + * On success, it returns the number of blocks being mapped or allocated. + * if create==0 and the blocks are pre-allocated and unwritten block, * the result buffer head is unmapped. If the create ==1, it will make sure * the buffer head is mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in - * that casem, buffer head is unmapped + * that case, buffer head is unmapped * * It returns the error in case of allocation failure. 
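/*
 * Editor's sketch (not part of the patch): the locking protocol that the
 * comment above describes, stripped to its skeleton. A shared-lock lookup is
 * tried first; only a miss with create set escalates to the exclusive lock
 * and allocation. All names are illustrative stand-ins, and the two helpers
 * are fakes so the model stays self-contained.
 */
#include <pthread.h>

struct model_map { unsigned long lblk, pblk, len; };

static int model_lookup(struct model_map *map)		/* 0: hole */
{
	(void)map;
	return 0;
}

static int model_allocate(struct model_map *map)	/* > 0: blocks mapped */
{
	map->pblk = 1000 + map->lblk;	/* made-up physical block */
	return (int)map->len;
}

static int model_map_blocks(pthread_rwlock_t *i_data_sem,
			    struct model_map *map, int create)
{
	int retval;

	pthread_rwlock_rdlock(i_data_sem);	/* cheap, concurrent lookup */
	retval = model_lookup(map);
	pthread_rwlock_unlock(i_data_sem);

	if (retval > 0 || !create)	/* hit, or plain lookup requested */
		return retval;

	pthread_rwlock_wrlock(i_data_sem);	/* i_data may change: exclusive */
	retval = model_allocate(map);
	pthread_rwlock_unlock(i_data_sem);
	return retval;
}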
*/ -int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, - unsigned long max_blocks, struct buffer_head *bh, - int create, int extend_disksize) +int ext4_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags) { + struct extent_status es; int retval; + int ret = 0; +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif - clear_buffer_mapped(bh); + map->m_flags = 0; + ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," + "logical block %lu\n", inode->i_ino, flags, map->m_len, + (unsigned long) map->m_lblk); /* - * Try to see if we can get the block without requesting - * for new file system block. + * ext4_map_blocks returns an int, and m_len is an unsigned int */ - down_read((&EXT4_I(inode)->i_data_sem)); - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { - retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, - bh, 0, 0); + if (unlikely(map->m_len > INT_MAX)) + map->m_len = INT_MAX; + + /* We can handle the block number less than EXT_MAX_BLOCKS */ + if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) + return -EIO; + + /* Lookup extent status tree firstly */ + if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + ext4_es_lru_add(inode); + if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { + map->m_pblk = ext4_es_pblock(&es) + + map->m_lblk - es.es_lblk; + map->m_flags |= ext4_es_is_written(&es) ? + EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; + retval = es.es_len - (map->m_lblk - es.es_lblk); + if (retval > map->m_len) + retval = map->m_len; + map->m_len = retval; + } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { + retval = 0; + } else { + BUG_ON(1); + } +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(handle, inode, map, + &orig_map, flags); +#endif + goto found; + } + + /* + * Try to see if we can get the block without requesting a new + * file system block. + */ + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + down_read(&EXT4_I(inode)->i_data_sem); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); } else { - retval = ext4_get_blocks_handle(handle, - inode, block, max_blocks, bh, 0, 0); + retval = ext4_ind_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } + if (retval > 0) { + unsigned int status; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? 
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && + ext4_find_delalloc_range(inode, map->m_lblk, + map->m_lblk + map->m_len - 1)) + status |= EXTENT_STATUS_DELAYED; + ret = ext4_es_insert_extent(inode, map->m_lblk, + map->m_len, map->m_pblk, status); + if (ret < 0) + retval = ret; + } + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + up_read((&EXT4_I(inode)->i_data_sem)); + +found: + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + ret = check_block_validity(inode, map); + if (ret != 0) + return ret; } - up_read((&EXT4_I(inode)->i_data_sem)); /* If it is only a block(s) look up */ - if (!create) + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) return retval; /* * Returns if the blocks have already allocated * * Note that if blocks have been preallocated - * ext4_ext_get_block() returns th create = 0 + * ext4_ext_get_block() returns the create = 0 * with buffer head unmapped. */ - if (retval > 0 && buffer_mapped(bh)) - return retval; + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) + /* + * If we need to convert extent to unwritten + * we continue and do the actual work in + * ext4_ext_map_blocks() + */ + if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) + return retval; /* - * New blocks allocate and/or writing to uninitialized extent + * Here we clear m_flags because after allocating an new extent, + * it will be set again. + */ + map->m_flags &= ~EXT4_MAP_FLAGS; + + /* + * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take * the write lock of i_data_sem, and call get_blocks() * with create == 1 flag. */ - down_write((&EXT4_I(inode)->i_data_sem)); + down_write(&EXT4_I(inode)->i_data_sem); + + /* + * if the caller is from delayed allocation writeout path + * we have already reserved fs blocks for allocation + * let the underlying get_block() function know to + * avoid double accounting + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); /* * We need to check for EXT4 here because migrate * could have changed the inode type in between */ - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { - retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, - bh, create, extend_disksize); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags); } else { - retval = ext4_get_blocks_handle(handle, inode, block, - max_blocks, bh, create, extend_disksize); + retval = ext4_ind_map_blocks(handle, inode, map, flags); + + if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { + /* + * We allocated new blocks which will result in + * i_data's format changing. Force the migrate + * to fail by clearing migrate flags + */ + ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + } + + /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. We don't + * support fallocate for non extent files. So we can update + * reserve space here. 
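/*
 * Editor's sketch (not part of the patch): the extent-status cache hit near
 * the top of ext4_map_blocks() above is pure interval arithmetic. Given a
 * cached extent covering [es_lblk, es_lblk + es_len) that contains m_lblk,
 * the request is translated and clipped like this (names are stand-ins):
 */
struct model_es { unsigned int es_lblk, es_len; unsigned long long es_pblk; };

static unsigned int model_es_clip(const struct model_es *es,
				  unsigned int m_lblk, unsigned int m_len,
				  unsigned long long *pblk)
{
	unsigned int avail = es->es_len - (m_lblk - es->es_lblk);

	*pblk = es->es_pblk + (m_lblk - es->es_lblk);
	return avail < m_len ? avail : m_len;	/* blocks served from cache */
}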
+ */ + if ((retval > 0) && + (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) + ext4_da_update_reserve_space(inode, retval, 1); } + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); + + if (retval > 0) { + unsigned int status; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + /* + * If the extent has been zeroed out, we don't need to update + * extent status tree. + */ + if ((flags & EXT4_GET_BLOCKS_PRE_IO) && + ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + if (ext4_es_is_written(&es)) + goto has_zeroout; + } + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && + ext4_find_delalloc_range(inode, map->m_lblk, + map->m_lblk + map->m_len - 1)) + status |= EXTENT_STATUS_DELAYED; + ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); + if (ret < 0) + retval = ret; + } + +has_zeroout: up_write((&EXT4_I(inode)->i_data_sem)); + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + ret = check_block_validity(inode, map); + if (ret != 0) + return ret; + } return retval; } -static int ext4_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +/* Maximum number of blocks we map for direct IO at once. */ +#define DIO_MAX_BLOCKS 4096 + +static int _ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int flags) { handle_t *handle = ext4_journal_current_handle(); + struct ext4_map_blocks map; int ret = 0, started = 0; - unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + int dio_credits; + + if (ext4_has_inline_data(inode)) + return -ERANGE; + + map.m_lblk = iblock; + map.m_len = bh->b_size >> inode->i_blkbits; - if (create && !handle) { + if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) { /* Direct IO write... */ - if (max_blocks > DIO_MAX_BLOCKS) - max_blocks = DIO_MAX_BLOCKS; - handle = ext4_journal_start(inode, DIO_CREDITS + - 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)); + if (map.m_len > DIO_MAX_BLOCKS) + map.m_len = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + dio_credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out; + return ret; } started = 1; } - ret = ext4_get_blocks_wrap(handle, inode, iblock, - max_blocks, bh_result, create, 0); + ret = ext4_map_blocks(handle, inode, &map, flags); if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); + ext4_io_end_t *io_end = ext4_inode_aio(inode); + + map_bh(bh, inode->i_sb, map.m_pblk); + bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) + set_buffer_defer_completion(bh); + bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } if (started) ext4_journal_stop(handle); -out: return ret; } +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + return _ext4_get_block(inode, iblock, bh, + create ? 
EXT4_GET_BLOCKS_CREATE : 0); +} + /* * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, ext4_lblk_t block, int create, int *errp) { - struct buffer_head dummy; + struct ext4_map_blocks map; + struct buffer_head *bh; int fatal = 0, err; J_ASSERT(handle != NULL || create == 0); - dummy.b_state = 0; - dummy.b_blocknr = -1000; - buffer_trace_init(&dummy.b_history); - err = ext4_get_blocks_wrap(handle, inode, block, 1, - &dummy, create, 1); - /* - * ext4_get_blocks_handle() returns number of blocks - * mapped. 0 in case of a HOLE. - */ - if (err > 0) { - if (err > 1) - WARN_ON(1); - err = 0; + map.m_lblk = block; + map.m_len = 1; + err = ext4_map_blocks(handle, inode, &map, + create ? EXT4_GET_BLOCKS_CREATE : 0); + + /* ensure we send some value back into *errp */ + *errp = 0; + + if (create && err == 0) + err = -ENOSPC; /* should never happen */ + if (err < 0) + *errp = err; + if (err <= 0) + return NULL; + + bh = sb_getblk(inode->i_sb, map.m_pblk); + if (unlikely(!bh)) { + *errp = -ENOMEM; + return NULL; } - *errp = err; - if (!err && buffer_mapped(&dummy)) { - struct buffer_head *bh; - bh = sb_getblk(inode->i_sb, dummy.b_blocknr); - if (!bh) { - *errp = -EIO; - goto err; - } - if (buffer_new(&dummy)) { - J_ASSERT(create != 0); - J_ASSERT(handle != NULL); + if (map.m_flags & EXT4_MAP_NEW) { + J_ASSERT(create != 0); + J_ASSERT(handle != NULL); - /* - * Now that we do not always journal data, we should - * keep in mind whether this should always journal the - * new buffer as metadata. For now, regular file - * writes use ext4_get_block instead, so it's not a - * problem. - */ - lock_buffer(bh); - BUFFER_TRACE(bh, "call get_create_access"); - fatal = ext4_journal_get_create_access(handle, bh); - if (!fatal && !buffer_uptodate(bh)) { - memset(bh->b_data,0,inode->i_sb->s_blocksize); - set_buffer_uptodate(bh); - } - unlock_buffer(bh); - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh); - if (!fatal) - fatal = err; - } else { - BUFFER_TRACE(bh, "not a new buffer"); - } - if (fatal) { - *errp = fatal; - brelse(bh); - bh = NULL; + /* + * Now that we do not always journal data, we should + * keep in mind whether this should always journal the + * new buffer as metadata. For now, regular file + * writes use ext4_get_block instead, so it's not a + * problem. 
+ */ + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + fatal = ext4_journal_get_create_access(handle, bh); + if (!fatal && !buffer_uptodate(bh)) { + memset(bh->b_data, 0, inode->i_sb->s_blocksize); + set_buffer_uptodate(bh); } - return bh; + unlock_buffer(bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); + if (!fatal) + fatal = err; + } else { + BUFFER_TRACE(bh, "not a new buffer"); } -err: - return NULL; + if (fatal) { + *errp = fatal; + brelse(bh); + bh = NULL; + } + return bh; } struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, ext4_lblk_t block, int create, int *err) { - struct buffer_head * bh; + struct buffer_head *bh; bh = ext4_getblk(handle, inode, block, create, err); if (!bh) return bh; if (buffer_uptodate(bh)) return bh; - ll_rw_block(READ_META, 1, &bh); + ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -1111,13 +852,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, return NULL; } -static int walk_page_buffers( handle_t *handle, - struct buffer_head *head, - unsigned from, - unsigned to, - int *partial, - int (*fn)( handle_t *handle, - struct buffer_head *bh)) +int ext4_walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)) { struct buffer_head *bh; unsigned block_start, block_end; @@ -1125,10 +866,9 @@ static int walk_page_buffers( handle_t *handle, int err, ret = 0; struct buffer_head *next; - for ( bh = head, block_start = 0; - ret == 0 && (bh != head || !block_start); - block_start = block_end, bh = next) - { + for (bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) { next = bh->b_this_page; block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { @@ -1150,11 +890,10 @@ static int walk_page_buffers( handle_t *handle, * and the commit_write(). So doing the jbd2_journal_start at the start of * prepare_write() is the right place. * - * Also, this function can nest inside ext4_writepage() -> - * block_write_full_page(). In that case, we *know* that ext4_writepage() - * has generated enough buffer credits to do the whole page. So we won't - * block on the journal in that case, which is good, because the caller may - * be PF_MEMALLOC. + * Also, this function can nest inside ext4_writepage(). In that case, we + * *know* that ext4_writepage() has generated enough buffer credits to do the + * whole page. So we won't block on the journal in that case, which is good, + * because the caller may be PF_MEMALLOC. * * By accident, ext4 can be reentered when a transaction is open via * quota file writes. If we were to commit the transaction while thus @@ -1168,103 +907,153 @@ static int walk_page_buffers( handle_t *handle, * is elevated. We'll still have enough credits for the tiny quotafile * write. */ -static int do_journal_get_write_access(handle_t *handle, - struct buffer_head *bh) +int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) { + int dirty = buffer_dirty(bh); + int ret; + if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; - return ext4_journal_get_write_access(handle, bh); + /* + * __block_write_begin() could have dirtied some buffers. Clean + * the dirty bit as jbd2_journal_get_write_access() could complain + * otherwise about fs integrity issues. 
Setting of the dirty bit + * by __block_write_begin() isn't a real problem here as we clear + * the bit before releasing a page lock and thus writeback cannot + * ever write the buffer. + */ + if (dirty) + clear_buffer_dirty(bh); + BUFFER_TRACE(bh, "get write access"); + ret = ext4_journal_get_write_access(handle, bh); + if (!ret && dirty) + ret = ext4_handle_dirty_metadata(handle, NULL, bh); + return ret; } +static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); static int ext4_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { - struct inode *inode = mapping->host; - int ret, needed_blocks = ext4_writepage_trans_blocks(inode); + struct inode *inode = mapping->host; + int ret, needed_blocks; handle_t *handle; int retries = 0; - struct page *page; - pgoff_t index; - unsigned from, to; + struct page *page; + pgoff_t index; + unsigned from, to; - index = pos >> PAGE_CACHE_SHIFT; - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; + trace_ext4_write_begin(inode, pos, len, flags); + /* + * Reserve one block more for addition to orphan list in case + * we allocate blocks but write fails for some reason + */ + needed_blocks = ext4_writepage_trans_blocks(inode) + 1; + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; -retry: - page = __grab_cache_page(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, + flags, pagep); + if (ret < 0) + return ret; + if (ret == 1) + return 0; + } - handle = ext4_journal_start(inode, needed_blocks); - if (IS_ERR(handle)) { - unlock_page(page); - page_cache_release(page); - ret = PTR_ERR(handle); - goto out; + /* + * grab_cache_page_write_begin() can take a long time if the + * system is thrashing due to memory pressure, or if the page + * is being written back. So grab it first before we start + * the transaction handle. This also allows us to allocate + * the page (if needed) without using GFP_NOFS. + */ +retry_grab: + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + unlock_page(page); + +retry_journal: + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); + if (IS_ERR(handle)) { + page_cache_release(page); + return PTR_ERR(handle); } - ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - ext4_get_block); + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + unlock_page(page); + page_cache_release(page); + ext4_journal_stop(handle); + goto retry_grab; + } + /* In case writeback began while the page was unlocked */ + wait_for_stable_page(page); + + if (ext4_should_dioread_nolock(inode)) + ret = __block_write_begin(page, pos, len, ext4_get_block_write); + else + ret = __block_write_begin(page, pos, len, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { - ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, do_journal_get_write_access); + ret = ext4_walk_page_buffers(handle, page_buffers(page), + from, to, NULL, + do_journal_get_write_access); } if (ret) { + unlock_page(page); + /* + * __block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. 
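/*
 * Editor's sketch (not part of the patch): the retry structure of
 * ext4_write_begin() above. The page is grabbed before the journal handle so
 * a slow page lookup cannot stall a running transaction, and the page is
 * re-validated under its lock afterwards; truncation restarts everything.
 * The helpers below are fakes so the skeleton compiles on its own.
 */
struct model_page { int mapping_valid; };

static struct model_page model_the_page = { 1 };

static struct model_page *model_grab_page(void)	/* may sleep, no handle held */
{
	return &model_the_page;
}

static int model_journal_start(void)		/* may sleep as well */
{
	return 0;
}

static int model_write_begin(void)
{
	struct model_page *page;

retry_grab:
	page = model_grab_page();
	if (!page)
		return -1;		/* -ENOMEM in the real code */
	if (model_journal_start() != 0)
		return -1;
	if (!page->mapping_valid)	/* truncated while unlocked */
		goto retry_grab;	/* real code drops page + handle first */
	return 0;
}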
Don't need + * i_size_read because we hold i_mutex. + * + * Add inode to orphan list in case we crash before + * truncate finishes + */ + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + ext4_orphan_add(handle, inode); + ext4_journal_stop(handle); - unlock_page(page); - page_cache_release(page); - } + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might + * still be on the orphan list; we need to + * make sure the inode is removed from the + * orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; -out: + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; + page_cache_release(page); + return ret; + } + *pagep = page; return ret; } -int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh) -{ - int err = jbd2_journal_dirty_data(handle, bh); - if (err) - ext4_journal_abort_handle(__FUNCTION__, __FUNCTION__, - bh, handle, err); - return err; -} - /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct buffer_head *bh) { + int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); - return ext4_journal_dirty_metadata(handle, bh); -} - -/* - * Generic write_end handler for ordered and writeback ext4 journal modes. - * We can't use generic_write_end, because that unlocks the page and we need to - * unlock the page after ext4_journal_stop, but ext4_journal_stop must run - * after block_write_end. - */ -static int ext4_generic_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = file->f_mapping->host; - - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - - if (pos+copied > inode->i_size) { - i_size_write(inode, pos+copied); - mark_inode_dirty(inode); - } - - return copied; + ret = ext4_handle_dirty_metadata(handle, NULL, bh); + clear_buffer_meta(bh); + clear_buffer_prio(bh); + return ret; } /* @@ -1274,168 +1063,660 @@ static int ext4_generic_write_end(struct file *file, * ext4 never places buffers on inode->i_mapping->private_list. metadata * buffers are managed internally. 
*/ -static int ext4_ordered_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) +static int ext4_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = file->f_mapping->host; - unsigned from, to; + struct inode *inode = mapping->host; int ret = 0, ret2; + int i_size_changed = 0; - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; + trace_ext4_write_end(inode, pos, len, copied); + if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { + ret = ext4_jbd2_file_inode(handle, inode); + if (ret) { + unlock_page(page); + page_cache_release(page); + goto errout; + } + } - ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, ext4_journal_dirty_data); + if (ext4_has_inline_data(inode)) { + ret = ext4_write_inline_data_end(inode, pos, len, + copied, page); + if (ret < 0) + goto errout; + copied = ret; + } else + copied = block_write_end(file, mapping, pos, + len, copied, page, fsdata); - if (ret == 0) { - /* - * generic_write_end() will run mark_inode_dirty() if i_size - * changes. So let's piggyback the i_disksize mark_inode_dirty - * into that. - */ - loff_t new_i_size; + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. + * + * But it's important to update i_size while still holding page lock: + * page writeout could otherwise come in and zero beyond i_size. + */ + if (pos + copied > inode->i_size) { + i_size_write(inode, pos + copied); + i_size_changed = 1; + } - new_i_size = pos + copied; - if (new_i_size > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = new_i_size; - copied = ext4_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - if (copied < 0) - ret = copied; + if (pos + copied > EXT4_I(inode)->i_disksize) { + /* We need to mark inode dirty even if + * new_i_size is less than inode->i_size + * but greater than i_disksize. (hint delalloc) + */ + ext4_update_i_disksize(inode, (pos + copied)); + i_size_changed = 1; } - ret2 = ext4_journal_stop(handle); - if (!ret) - ret = ret2; unlock_page(page); page_cache_release(page); - return ret ? ret : copied; -} - -static int ext4_writeback_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - handle_t *handle = ext4_journal_current_handle(); - struct inode *inode = file->f_mapping->host; - int ret = 0, ret2; - loff_t new_i_size; - - new_i_size = pos + copied; - if (new_i_size > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = new_i_size; - - copied = ext4_generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - if (copied < 0) - ret = copied; + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + ext4_mark_inode_dirty(handle, inode); + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. 
So truncate them + */ + ext4_orphan_add(handle, inode); +errout: ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); - page_cache_release(page); + + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } return ret ? ret : copied; } static int ext4_journalled_write_end(struct file *file, - struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; int ret = 0, ret2; int partial = 0; unsigned from, to; + loff_t new_i_size; + trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; - if (copied < len) { - if (!PageUptodate(page)) - copied = 0; - page_zero_new_buffers(page, from+copied, to); - } + BUG_ON(!ext4_handle_valid(handle)); - ret = walk_page_buffers(handle, page_buffers(page), from, - to, &partial, write_end_fn); - if (!partial) - SetPageUptodate(page); - if (pos+copied > inode->i_size) + if (ext4_has_inline_data(inode)) + copied = ext4_write_inline_data_end(inode, pos, len, + copied, page); + else { + if (copied < len) { + if (!PageUptodate(page)) + copied = 0; + page_zero_new_buffers(page, from+copied, to); + } + + ret = ext4_walk_page_buffers(handle, page_buffers(page), from, + to, &partial, write_end_fn); + if (!partial) + SetPageUptodate(page); + } + new_i_size = pos + copied; + if (new_i_size > inode->i_size) i_size_write(inode, pos+copied); - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; - if (inode->i_size > EXT4_I(inode)->i_disksize) { - EXT4_I(inode)->i_disksize = inode->i_size; + ext4_set_inode_state(inode, EXT4_STATE_JDATA); + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; + if (new_i_size > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, new_i_size); ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; } + unlock_page(page); + page_cache_release(page); + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. So truncate them + */ + ext4_orphan_add(handle, inode); + ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - unlock_page(page); - page_cache_release(page); + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } return ret ? ret : copied; } /* - * bmap() is special. It gets used by applications such as lilo and by - * the swapper to find the on-disk block of a specific piece of data. - * - * Naturally, this is dangerous if the block concerned is still in the - * journal. If somebody makes a swapfile on an ext4 data-journaling - * filesystem and enables swap, then they may get a nasty shock when the - * data getting swapped to that swapfile suddenly gets overwritten by - * the original zero's written out previously to the journal and - * awaiting writeback in the kernel's buffer cache. 
- * - * So, if we see any bmap calls here on a modified, data-journaled file, - * take extra steps to flush any blocks which might be in the cache. + * Reserve a metadata for a single block located at lblock */ -static sector_t ext4_bmap(struct address_space *mapping, sector_t block) +static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) { - struct inode *inode = mapping->host; - journal_t *journal; - int err; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int md_needed; + ext4_lblk_t save_last_lblock; + int save_len; - if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks + * worse case is one extent per block + */ + spin_lock(&ei->i_block_reservation_lock); + /* + * ext4_calc_metadata_amount() has side effects, which we have + * to be prepared undo if we fail to claim space. + */ + save_len = ei->i_da_metadata_calc_len; + save_last_lblock = ei->i_da_metadata_calc_last_lblock; + md_needed = EXT4_NUM_B2C(sbi, + ext4_calc_metadata_amount(inode, lblock)); + trace_ext4_da_reserve_space(inode, md_needed); + + /* + * We do still charge estimated metadata to the sb though; + * we cannot afford to run out of free blocks. + */ + if (ext4_claim_free_clusters(sbi, md_needed, 0)) { + ei->i_da_metadata_calc_len = save_len; + ei->i_da_metadata_calc_last_lblock = save_last_lblock; + spin_unlock(&ei->i_block_reservation_lock); + return -ENOSPC; + } + ei->i_reserved_meta_blocks += md_needed; + spin_unlock(&ei->i_block_reservation_lock); + + return 0; /* success */ +} + +/* + * Reserve a single cluster located at lblock + */ +static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int md_needed; + int ret; + ext4_lblk_t save_last_lblock; + int save_len; + + /* + * We will charge metadata quota at writeout time; this saves + * us from metadata over-estimation, though we may go over by + * a small amount in the end. Here we just reserve for data. + */ + ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); + if (ret) + return ret; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks + * worse case is one extent per block + */ + spin_lock(&ei->i_block_reservation_lock); + /* + * ext4_calc_metadata_amount() has side effects, which we have + * to be prepared undo if we fail to claim space. + */ + save_len = ei->i_da_metadata_calc_len; + save_last_lblock = ei->i_da_metadata_calc_last_lblock; + md_needed = EXT4_NUM_B2C(sbi, + ext4_calc_metadata_amount(inode, lblock)); + trace_ext4_da_reserve_space(inode, md_needed); + + /* + * We do still charge estimated metadata to the sb though; + * we cannot afford to run out of free blocks. 
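Because ext4_calc_metadata_amount() mutates the per-inode calculation cache, both reservation paths above snapshot that state and roll it back if the cluster claim fails. The same save/undo shape in a self-contained sketch (toy state, hypothetical claim() in place of ext4_claim_free_clusters()):

#include <stdio.h>

struct calc_state { int len; long last_lblk; };

static int claim(int want, int avail) { return want <= avail ? 0 : -1; }

static int reserve(struct calc_state *st, long lblk, int avail)
{
	struct calc_state save = *st;	/* snapshot before the side effects */

	st->len++;			/* stand-in for calc_metadata_amount() */
	st->last_lblk = lblk;
	if (claim(st->len, avail) < 0) {
		*st = save;		/* undo on failure, as the kernel code does */
		return -1;
	}
	return 0;
}

int main(void)
{
	struct calc_state st = { 0, 0 };

	printf("first: %d (len=%d)\n", reserve(&st, 10, 1), st.len);
	/* second claim fails; the calc state is restored to len=1 */
	printf("second: %d (len=%d)\n", reserve(&st, 11, 1), st.len);
	return 0;
}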
+ */ + if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) { + ei->i_da_metadata_calc_len = save_len; + ei->i_da_metadata_calc_last_lblock = save_last_lblock; + spin_unlock(&ei->i_block_reservation_lock); + dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); + return -ENOSPC; + } + ei->i_reserved_data_blocks++; + ei->i_reserved_meta_blocks += md_needed; + spin_unlock(&ei->i_block_reservation_lock); + + return 0; /* success */ +} + +static void ext4_da_release_space(struct inode *inode, int to_free) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + + if (!to_free) + return; /* Nothing to release, exit */ + + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + + trace_ext4_da_release_space(inode, to_free); + if (unlikely(to_free > ei->i_reserved_data_blocks)) { /* - * This is a REALLY heavyweight approach, but the use of - * bmap on dirty files is expected to be extremely rare: - * only if we run lilo or swapon on a freshly made file - * do we expect this to happen. - * - * (bmap requires CAP_SYS_RAWIO so this does not - * represent an unprivileged user DOS attack --- we'd be - * in trouble if mortal users could trigger this path at - * will.) - * - * NB. EXT4_STATE_JDATA is not set on files other than - * regular files. If somebody wants to bmap a directory - * or symlink and gets confused because the buffer - * hasn't yet been flushed to disk, they deserve - * everything they get. + * if there aren't enough reserved blocks, then the + * counter is messed up somewhere. Since this + * function is called from invalidate page, it's + * harmless to return without any action. */ + ext4_warning(inode->i_sb, "ext4_da_release_space: " + "ino %lu, to_free %d with only %d reserved " + "data blocks", inode->i_ino, to_free, + ei->i_reserved_data_blocks); + WARN_ON(1); + to_free = ei->i_reserved_data_blocks; + } + ei->i_reserved_data_blocks -= to_free; - EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; - journal = EXT4_JOURNAL(inode); - jbd2_journal_lock_updates(journal); - err = jbd2_journal_flush(journal); - jbd2_journal_unlock_updates(journal); + if (ei->i_reserved_data_blocks == 0) { + /* + * We can release all of the reserved metadata blocks + * only when we have written all of the delayed + * allocation blocks. + * Note that in case of bigalloc, i_reserved_meta_blocks, + * i_reserved_data_blocks, etc. refer to number of clusters. 
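With bigalloc, all of these counters are in cluster units; converting a block count to clusters is a round-up by the cluster ratio, which is what EXT4_NUM_B2C() computes from s_cluster_bits. A quick illustrative check of that arithmetic (the ratio is always a power of two):

#include <stdio.h>

/* blocks-to-clusters round-up, after the EXT4_NUM_B2C() macro;
 * cluster_bits is log2(blocks per cluster) */
static unsigned b2c(unsigned blocks, unsigned cluster_bits)
{
	return (blocks + (1u << cluster_bits) - 1) >> cluster_bits;
}

int main(void)
{
	/* 16-block clusters: 1..16 blocks all cost one cluster */
	for (unsigned b = 1; b <= 33; b += 16)
		printf("%2u blocks -> %u clusters\n", b, b2c(b, 4));
	return 0;
}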
+ */ + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + ei->i_reserved_meta_blocks); + ei->i_reserved_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; + } - if (err) + /* update fs dirty data blocks counter */ + percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); + + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + + dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); +} + +static void ext4_da_page_release_reservation(struct page *page, + unsigned int offset, + unsigned int length) +{ + int to_release = 0; + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + struct inode *inode = page->mapping->host; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned int stop = offset + length; + int num_clusters; + ext4_fsblk_t lblk; + + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + + if (next_off > stop) + break; + + if ((offset <= curr_off) && (buffer_delay(bh))) { + to_release++; + clear_buffer_delay(bh); + } + curr_off = next_off; + } while ((bh = bh->b_this_page) != head); + + if (to_release) { + lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + ext4_es_remove_extent(inode, lblk, to_release); + } + + /* If we have released all the blocks belonging to a cluster, then we + * need to release the reserved space for that cluster. */ + num_clusters = EXT4_NUM_B2C(sbi, to_release); + while (num_clusters > 0) { + lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + + ((num_clusters - 1) << sbi->s_cluster_bits); + if (sbi->s_cluster_ratio == 1 || + !ext4_find_delalloc_cluster(inode, lblk)) + ext4_da_release_space(inode, 1); + + num_clusters--; + } +} + +/* + * Delayed allocation stuff + */ + +struct mpage_da_data { + struct inode *inode; + struct writeback_control *wbc; + + pgoff_t first_page; /* The first page to write */ + pgoff_t next_page; /* Current page to examine */ + pgoff_t last_page; /* Last page to examine */ + /* + * Extent to map - this can be after first_page because that can be + * fully mapped. We somewhat abuse m_flags to store whether the extent + * is delalloc or unwritten. + */ + struct ext4_map_blocks map; + struct ext4_io_submit io_submit; /* IO submission data */ +}; + +static void mpage_release_unused_pages(struct mpage_da_data *mpd, + bool invalidate) +{ + int nr_pages, i; + pgoff_t index, end; + struct pagevec pvec; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + + /* This is necessary when next_page == 0. 
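The reservation-release walk above converts page indices to logical blocks with shifts: a page covers 2^(PAGE_SHIFT - blkbits) file blocks. A quick check of that conversion (assuming 4K pages and 1K blocks for illustration):

#include <stdio.h>

#define PAGE_SHIFT 12	/* 4096-byte pages assumed for this sketch */

static unsigned long page_to_lblk(unsigned long index, unsigned blkbits)
{
	return index << (PAGE_SHIFT - blkbits);
}

int main(void)
{
	/* with 1K blocks (blkbits = 10) each page holds 4 blocks */
	for (unsigned long idx = 0; idx < 3; idx++)
		printf("page %lu -> first lblk %lu\n", idx, page_to_lblk(idx, 10));
	return 0;
}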
*/ + if (mpd->first_page >= mpd->next_page) + return; + + index = mpd->first_page; + end = mpd->next_page - 1; + if (invalidate) { + ext4_lblk_t start, last; + start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); + ext4_es_remove_extent(inode, start, last - start + 1); + } + + pagevec_init(&pvec, 0); + while (index <= end) { + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + if (page->index > end) + break; + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + if (invalidate) { + block_invalidatepage(page, 0, PAGE_CACHE_SIZE); + ClearPageUptodate(page); + } + unlock_page(page); + } + index = pvec.pages[nr_pages - 1]->index + 1; + pagevec_release(&pvec); + } +} + +static void ext4_print_free_blocks(struct inode *inode) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct super_block *sb = inode->i_sb; + struct ext4_inode_info *ei = EXT4_I(inode); + + ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", + EXT4_C2B(EXT4_SB(inode->i_sb), + ext4_count_free_clusters(sb))); + ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); + ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", + (long long) EXT4_C2B(EXT4_SB(sb), + percpu_counter_sum(&sbi->s_freeclusters_counter))); + ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", + (long long) EXT4_C2B(EXT4_SB(sb), + percpu_counter_sum(&sbi->s_dirtyclusters_counter))); + ext4_msg(sb, KERN_CRIT, "Block reservation details"); + ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", + ei->i_reserved_data_blocks); + ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u", + ei->i_reserved_meta_blocks); + ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u", + ei->i_allocated_meta_blocks); + return; +} + +static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) +{ + return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); +} + +/* + * This function is grabs code from the very beginning of + * ext4_map_blocks, but assumes that the caller is from delayed write + * time. This function looks up the requested blocks and sets the + * buffer delay bit under the protection of i_data_sem. + */ +static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, + struct ext4_map_blocks *map, + struct buffer_head *bh) +{ + struct extent_status es; + int retval; + sector_t invalid_block = ~((sector_t) 0xffff); +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif + + if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) + invalid_block = ~0; + + map->m_flags = 0; + ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u," + "logical block %lu\n", inode->i_ino, map->m_len, + (unsigned long) map->m_lblk); + + /* Lookup extent status tree firstly */ + if (ext4_es_lookup_extent(inode, iblock, &es)) { + ext4_es_lru_add(inode); + if (ext4_es_is_hole(&es)) { + retval = 0; + down_read(&EXT4_I(inode)->i_data_sem); + goto add_delayed; + } + + /* + * Delayed extent could be allocated by fallocate. + * So we need to check it. 
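When the status-tree lookup in ext4_da_map_blocks() hits, the usable length is whatever remains of the cached extent from iblock onward, clamped to the requested map->m_len. That clamp in isolation (toy extent type, not the kernel's struct extent_status):

#include <stdio.h>

struct es { unsigned long lblk; unsigned len; };

/* remaining length of a cached extent from iblock, clamped to want */
static unsigned map_len(const struct es *es, unsigned long iblock, unsigned want)
{
	unsigned left = es->len - (unsigned)(iblock - es->lblk);

	return left < want ? left : want;
}

int main(void)
{
	struct es e = { .lblk = 100, .len = 8 };

	printf("%u\n", map_len(&e, 103, 16)); /* 5 blocks left in the extent */
	printf("%u\n", map_len(&e, 103, 2));  /* clamped to the 2 requested */
	return 0;
}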
+ */ + if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); return 0; + } + + map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; + retval = es.es_len - (iblock - es.es_lblk); + if (retval > map->m_len) + retval = map->m_len; + map->m_len = retval; + if (ext4_es_is_written(&es)) + map->m_flags |= EXT4_MAP_MAPPED; + else if (ext4_es_is_unwritten(&es)) + map->m_flags |= EXT4_MAP_UNWRITTEN; + else + BUG_ON(1); + +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); +#endif + return retval; } - return generic_block_bmap(mapping,block,ext4_get_block); + /* + * Try to see if we can get the block without requesting a new + * file system block. + */ + down_read(&EXT4_I(inode)->i_data_sem); + if (ext4_has_inline_data(inode)) { + /* + * We will soon create blocks for this page, and let + * us pretend as if the blocks aren't allocated yet. + * In case of clusters, we have to handle the work + * of mapping from cluster so that the reserved space + * is calculated properly. + */ + if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) && + ext4_find_delalloc_cluster(inode, map->m_lblk)) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + retval = 0; + } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + retval = ext4_ext_map_blocks(NULL, inode, map, + EXT4_GET_BLOCKS_NO_PUT_HOLE); + else + retval = ext4_ind_map_blocks(NULL, inode, map, + EXT4_GET_BLOCKS_NO_PUT_HOLE); + +add_delayed: + if (retval == 0) { + int ret; + /* + * XXX: __block_prepare_write() unmaps passed block, + * is it OK? + */ + /* + * If the block was allocated from previously allocated cluster, + * then we don't need to reserve it again. However we still need + * to reserve metadata for every block we're going to write. + */ + if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { + ret = ext4_da_reserve_space(inode, iblock); + if (ret) { + /* not enough space to reserve */ + retval = ret; + goto out_unlock; + } + } else { + ret = ext4_da_reserve_metadata(inode, iblock); + if (ret) { + /* not enough space to reserve */ + retval = ret; + goto out_unlock; + } + } + + ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + ~0, EXTENT_STATUS_DELAYED); + if (ret) { + retval = ret; + goto out_unlock; + } + + /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served + * and it should not appear on the bh->b_state. + */ + map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; + + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + } else if (retval > 0) { + int ret; + unsigned int status; + + if (unlikely(retval != map->m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map->m_len); + WARN_ON(1); + } + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); + if (ret != 0) + retval = ret; + } + +out_unlock: + up_read((&EXT4_I(inode)->i_data_sem)); + + return retval; +} + +/* + * This is a special get_blocks_t callback which is used by + * ext4_da_write_begin(). It will either return mapped block or + * reserve space for a single block. + * + * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. + * We also have b_blocknr = -1 and b_bdev initialized properly + * + * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. 
+ * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev + * initialized properly. + */ +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + struct ext4_map_blocks map; + int ret = 0; + + BUG_ON(create == 0); + BUG_ON(bh->b_size != inode->i_sb->s_blocksize); + + map.m_lblk = iblock; + map.m_len = 1; + + /* + * first, we need to know whether the block is allocated already + * preallocated blocks are unmapped but should treated + * the same as allocated blocks. + */ + ret = ext4_da_map_blocks(inode, iblock, &map, bh); + if (ret <= 0) + return ret; + + map_bh(bh, inode->i_sb, map.m_pblk); + bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + + if (buffer_unwritten(bh)) { + /* A delayed write to unwritten bh should be marked + * new and mapped. Mapped ensures that we don't do + * get_block multiple times when we write to the same + * offset and new ensures that we do proper zero out + * for partial write. + */ + set_buffer_new(bh); + set_buffer_mapped(bh); + } + return 0; } static int bget_one(handle_t *handle, struct buffer_head *bh) @@ -1450,338 +1731,1527 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) return 0; } -static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) +static int __ext4_journalled_writepage(struct page *page, + unsigned int len) { - if (buffer_mapped(bh)) - return ext4_journal_dirty_data(handle, bh); - return 0; + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct buffer_head *page_bufs = NULL; + handle_t *handle = NULL; + int ret = 0, err = 0; + int inline_data = ext4_has_inline_data(inode); + struct buffer_head *inode_bh = NULL; + + ClearPageChecked(page); + + if (inline_data) { + BUG_ON(page->index != 0); + BUG_ON(len > ext4_get_max_inline_size(inode)); + inode_bh = ext4_journalled_write_inline_data(inode, len, page); + if (inode_bh == NULL) + goto out; + } else { + page_bufs = page_buffers(page); + if (!page_bufs) { + BUG(); + goto out; + } + ext4_walk_page_buffers(handle, page_bufs, 0, len, + NULL, bget_one); + } + /* As soon as we unlock the page, it can go away, but we have + * references to buffers so we are safe */ + unlock_page(page); + + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, + ext4_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + BUG_ON(!ext4_handle_valid(handle)); + + if (inline_data) { + BUFFER_TRACE(inode_bh, "get write access"); + ret = ext4_journal_get_write_access(handle, inode_bh); + + err = ext4_handle_dirty_metadata(handle, inode, inode_bh); + + } else { + ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, + do_journal_get_write_access); + + err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, + write_end_fn); + } + if (ret == 0) + ret = err; + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + + if (!ext4_has_inline_data(inode)) + ext4_walk_page_buffers(NULL, page_bufs, 0, len, + NULL, bput_one); + ext4_set_inode_state(inode, EXT4_STATE_JDATA); +out: + brelse(inode_bh); + return ret; } /* - * Note that we always start a transaction even if we're not journalling - * data. This is to preserve ordering: any hole instantiation within - * __block_write_full_page -> ext4_get_block() should be journalled - * along with the data so we don't crash and then get metadata which - * refers to old data. 
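A small aside on the bh->b_state update in ext4_da_get_block_prep() above: it splices the returned map flags into the buffer state with a mask-and-or, so only the EXT4_MAP_* bits are replaced and the rest of the buffer state survives. The same idiom on plain integers (the mask value here is invented for illustration):

#include <stdio.h>

#define MAP_FLAGS 0x0fu		/* toy stand-in for the EXT4_MAP_FLAGS mask */

static unsigned splice_flags(unsigned state, unsigned map_flags)
{
	/* clear the mapping-owned bits, then OR in the new ones */
	return (state & ~MAP_FLAGS) | (map_flags & MAP_FLAGS);
}

int main(void)
{
	unsigned state = 0xa3;	/* some upper bits plus stale map bits */

	printf("0x%02x\n", splice_flags(state, 0x05)); /* -> 0xa5 */
	return 0;
}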
+ * Note that we don't need to start a transaction unless we're journaling data + * because we should have holes filled from ext4_page_mkwrite(). We even don't + * need to file the inode to the transaction's list in ordered mode because if + * we are writing back data added by write(), the inode is already there and if + * we are writing back data modified via mmap(), no one guarantees in which + * transaction the data will hit the disk. In case we are journaling data, we + * cannot start transaction directly because transaction start ranks above page + * lock so we have to do some magic. + * + * This function can get called via... + * - ext4_writepages after taking page lock (have journal handle) + * - journal_submit_inode_data_buffers (no journal handle) + * - shrink_page_list via the kswapd/direct reclaim (no journal handle) + * - grab_page_cache when doing write_begin (have journal handle) * - * In all journalling modes block_write_full_page() will start the I/O. + * We don't do any block allocation in this function. If we have page with + * multiple blocks we need to write those buffer_heads that are mapped. This + * is important for mmaped based write. So if we do with blocksize 1K + * truncate(f, 1024); + * a = mmap(f, 0, 4096); + * a[0] = 'a'; + * truncate(f, 4096); + * we have in the page first buffer_head mapped via page_mkwrite call back + * but other buffer_heads would be unmapped but dirty (dirty done via the + * do_wp_page). So writepage should write the first block. If we modify + * the mmap area beyond 1024 we will again get a page_fault and the + * page_mkwrite callback will do the block allocation and mark the + * buffer_heads mapped. * - * Problem: + * We redirty the page if we have any buffer_heads that is either delay or + * unwritten in the page. + * + * We can get recursively called as show below. * * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> * ext4_writepage() * - * Similar for: + * But since we don't do any block allocation we should not deadlock. + * Page also have the dirty flag cleared so we don't get recurive page_lock. + */ +static int ext4_writepage(struct page *page, + struct writeback_control *wbc) +{ + int ret = 0; + loff_t size; + unsigned int len; + struct buffer_head *page_bufs = NULL; + struct inode *inode = page->mapping->host; + struct ext4_io_submit io_submit; + bool keep_towrite = false; + + trace_ext4_writepage(page); + size = i_size_read(inode); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + + page_bufs = page_buffers(page); + /* + * We cannot do block allocation or other extent handling in this + * function. If there are buffers needing that, we have to redirty + * the page. But we may reach here when we do a journal commit via + * journal_submit_inode_data_buffers() and in that case we must write + * allocated buffers to achieve data=ordered mode guarantees. + */ + if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_delay_or_unwritten)) { + redirty_page_for_writepage(wbc, page); + if (current->flags & PF_MEMALLOC) { + /* + * For memory cleaning there's no point in writing only + * some buffers. So just bail out. Warn if we came here + * from direct reclaim. + */ + WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) + == PF_MEMALLOC); + unlock_page(page); + return 0; + } + keep_towrite = true; + } + + if (PageChecked(page) && ext4_should_journal_data(inode)) + /* + * It's mmapped pagecache. Add buffers and journal it. 
There + * doesn't seem much point in redirtying the page here. + */ + return __ext4_journalled_writepage(page, len); + + ext4_io_submit_init(&io_submit, wbc); + io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_submit.io_end) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return -ENOMEM; + } + ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite); + ext4_io_submit(&io_submit); + /* Drop io_end reference we got from init */ + ext4_put_io_end_defer(io_submit.io_end); + return ret; +} + +static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) +{ + int len; + loff_t size = i_size_read(mpd->inode); + int err; + + BUG_ON(page->index != mpd->first_page); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + clear_page_dirty_for_io(page); + err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); + if (!err) + mpd->wbc->nr_to_write--; + mpd->first_page++; + + return err; +} + +#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) + +/* + * mballoc gives us at most this number of blocks... + * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). + * The rest of mballoc seems to handle chunks up to full group size. + */ +#define MAX_WRITEPAGES_EXTENT_LEN 2048 + +/* + * mpage_add_bh_to_extent - try to add bh to extent of blocks to map + * + * @mpd - extent of blocks + * @lblk - logical number of the block in the file + * @bh - buffer head we want to add to the extent * - * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... + * The function is used to collect contig. blocks in the same state. If the + * buffer doesn't require mapping for writeback and we haven't started the + * extent of buffers to map yet, the function returns 'true' immediately - the + * caller can write the buffer right away. Otherwise the function returns true + * if the block has been added to the extent, false if the block couldn't be + * added. + */ +static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, + struct buffer_head *bh) +{ + struct ext4_map_blocks *map = &mpd->map; + + /* Buffer that doesn't need mapping for writeback? */ + if (!buffer_dirty(bh) || !buffer_mapped(bh) || + (!buffer_delay(bh) && !buffer_unwritten(bh))) { + /* So far no extent to map => we write the buffer right away */ + if (map->m_len == 0) + return true; + return false; + } + + /* First block in the extent? */ + if (map->m_len == 0) { + map->m_lblk = lblk; + map->m_len = 1; + map->m_flags = bh->b_state & BH_FLAGS; + return true; + } + + /* Don't go larger than mballoc is willing to allocate */ + if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) + return false; + + /* Can we merge the block to our big extent? */ + if (lblk == map->m_lblk + map->m_len && + (bh->b_state & BH_FLAGS) == map->m_flags) { + map->m_len++; + return true; + } + return false; +} + +/* + * mpage_process_page_bufs - submit page buffers for IO or add them to extent * - * Same applies to ext4_get_block(). We will deadlock on various things like - * lock_journal and i_data_sem + * @mpd - extent of blocks for mapping + * @head - the first buffer in the page + * @bh - buffer we should start processing from + * @lblk - logical number of the block in the file corresponding to @bh * - * Setting PF_MEMALLOC here doesn't work - too many internal memory - * allocations fail. 
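The merge rules in mpage_add_bh_to_extent() above reduce to: start a new extent if none is accumulating, extend only when the block is logically contiguous and the buffer state matches, and never grow past the mballoc cap. Distilled into a stand-alone sketch (same MAX_WRITEPAGES_EXTENT_LEN cap, toy types, the "needs mapping at all?" predicate omitted):

#include <stdio.h>
#include <stdbool.h>

#define MAX_WRITEPAGES_EXTENT_LEN 2048

struct ext { unsigned long lblk; unsigned len; unsigned flags; };

static bool add_to_extent(struct ext *m, unsigned long lblk, unsigned flags)
{
	if (m->len == 0) {			/* first block starts the extent */
		m->lblk = lblk;
		m->len = 1;
		m->flags = flags;
		return true;
	}
	if (m->len >= MAX_WRITEPAGES_EXTENT_LEN) /* mballoc allocation cap */
		return false;
	if (lblk == m->lblk + m->len && flags == m->flags) {
		m->len++;			/* contiguous, same state: merge */
		return true;
	}
	return false;
}

int main(void)
{
	struct ext m = { 0, 0, 0 };
	int a = add_to_extent(&m, 10, 1);	/* starts the extent */
	int b = add_to_extent(&m, 11, 1);	/* merges */
	int c = add_to_extent(&m, 13, 1);	/* hole -> rejected */

	printf("%d %d %d (len=%u)\n", a, b, c, m.len);
	return 0;
}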
+ * Walk through page buffers from @bh upto @head (exclusive) and either submit + * the page for IO if all buffers in this page were mapped and there's no + * accumulated extent of buffers to map or add buffers in the page to the + * extent of buffers to map. The function returns 1 if the caller can continue + * by processing the next page, 0 if it should stop adding buffers to the + * extent to map because we cannot extend it anymore. It can also return value + * < 0 in case of error during IO submission. + */ +static int mpage_process_page_bufs(struct mpage_da_data *mpd, + struct buffer_head *head, + struct buffer_head *bh, + ext4_lblk_t lblk) +{ + struct inode *inode = mpd->inode; + int err; + ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1) + >> inode->i_blkbits; + + do { + BUG_ON(buffer_locked(bh)); + + if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { + /* Found extent to map? */ + if (mpd->map.m_len) + return 0; + /* Everything mapped so far and we hit EOF */ + break; + } + } while (lblk++, (bh = bh->b_this_page) != head); + /* So far everything mapped? Submit the page for IO. */ + if (mpd->map.m_len == 0) { + err = mpage_submit_page(mpd, head->b_page); + if (err < 0) + return err; + } + return lblk < blocks; +} + +/* + * mpage_map_buffers - update buffers corresponding to changed extent and + * submit fully mapped pages for IO * - * 16May01: If we're reentered then journal_current_handle() will be - * non-zero. We simply *return*. + * @mpd - description of extent to map, on return next extent to map * - * 1 July 2001: @@@ FIXME: - * In journalled data mode, a data buffer may be metadata against the - * current transaction. But the same file is part of a shared mapping - * and someone does a writepage() on it. + * Scan buffers corresponding to changed extent (we expect corresponding pages + * to be already locked) and update buffer state according to new extent state. + * We map delalloc buffers to their physical location, clear unwritten bits, + * and mark buffers as uninit when we perform writes to unwritten extents + * and do extent conversion after IO is finished. If the last page is not fully + * mapped, we update @map to the next extent in the last page that needs + * mapping. Otherwise we submit the page for IO. + */ +static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) +{ + struct pagevec pvec; + int nr_pages, i; + struct inode *inode = mpd->inode; + struct buffer_head *head, *bh; + int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits; + pgoff_t start, end; + ext4_lblk_t lblk; + sector_t pblock; + int err; + + start = mpd->map.m_lblk >> bpp_bits; + end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; + lblk = start << bpp_bits; + pblock = mpd->map.m_pblk; + + pagevec_init(&pvec, 0); + while (start <= end) { + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, + PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (page->index > end) + break; + /* Up to 'end' pages must be contiguous */ + BUG_ON(page->index != start); + bh = head = page_buffers(page); + do { + if (lblk < mpd->map.m_lblk) + continue; + if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { + /* + * Buffer after end of mapped extent. + * Find next buffer in the page to map. 
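mpage_process_page_bufs() stops at EOF using the rounded-up block count of i_size; buffers past that are ignored even when the page still has them. That round-up on its own:

#include <stdio.h>

/* number of file blocks covering i_size, rounded up as in
 * mpage_process_page_bufs() */
static unsigned long size_to_blocks(long long i_size, unsigned blkbits)
{
	return (unsigned long)((i_size + (1LL << blkbits) - 1) >> blkbits);
}

int main(void)
{
	printf("%lu\n", size_to_blocks(4096, 10)); /* 4 x 1K blocks exactly */
	printf("%lu\n", size_to_blocks(4097, 10)); /* 5: the tail rounds up */
	return 0;
}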
+ */ + mpd->map.m_len = 0; + mpd->map.m_flags = 0; + /* + * FIXME: If dioread_nolock supports + * blocksize < pagesize, we need to make + * sure we add size mapped so far to + * io_end->size as the following call + * can submit the page for IO. + */ + err = mpage_process_page_bufs(mpd, head, + bh, lblk); + pagevec_release(&pvec); + if (err > 0) + err = 0; + return err; + } + if (buffer_delay(bh)) { + clear_buffer_delay(bh); + bh->b_blocknr = pblock++; + } + clear_buffer_unwritten(bh); + } while (lblk++, (bh = bh->b_this_page) != head); + + /* + * FIXME: This is going to break if dioread_nolock + * supports blocksize < pagesize as we will try to + * convert potentially unmapped parts of inode. + */ + mpd->io_submit.io_end->size += PAGE_CACHE_SIZE; + /* Page fully mapped - let IO run! */ + err = mpage_submit_page(mpd, page); + if (err < 0) { + pagevec_release(&pvec); + return err; + } + start++; + } + pagevec_release(&pvec); + } + /* Extent fully mapped and matches with page boundary. We are done. */ + mpd->map.m_len = 0; + mpd->map.m_flags = 0; + return 0; +} + +static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) +{ + struct inode *inode = mpd->inode; + struct ext4_map_blocks *map = &mpd->map; + int get_blocks_flags; + int err, dioread_nolock; + + trace_ext4_da_write_pages_extent(inode, map); + /* + * Call ext4_map_blocks() to allocate any delayed allocation blocks, or + * to convert an unwritten extent to be initialized (in the case + * where we have written into one or more preallocated blocks). It is + * possible that we're going to need more metadata blocks than + * previously reserved. However we must not fail because we're in + * writeback and there is nothing we can do about it so it might result + * in data loss. So use reserved blocks to allocate metadata if + * possible. + * + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks + * in question are delalloc blocks. This affects functions in many + * different parts of the allocation call path. This flag exists + * primarily because we don't want to change *many* call functions, so + * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag + * once the inode's allocation semaphore is taken. + */ + get_blocks_flags = EXT4_GET_BLOCKS_CREATE | + EXT4_GET_BLOCKS_METADATA_NOFAIL; + dioread_nolock = ext4_should_dioread_nolock(inode); + if (dioread_nolock) + get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; + if (map->m_flags & (1 << BH_Delay)) + get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + + err = ext4_map_blocks(handle, inode, map, get_blocks_flags); + if (err < 0) + return err; + if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { + if (!mpd->io_submit.io_end->handle && + ext4_handle_valid(handle)) { + mpd->io_submit.io_end->handle = handle->h_rsv_handle; + handle->h_rsv_handle = NULL; + } + ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); + } + + BUG_ON(map->m_len == 0); + if (map->m_flags & EXT4_MAP_NEW) { + struct block_device *bdev = inode->i_sb->s_bdev; + int i; + + for (i = 0; i < map->m_len; i++) + unmap_underlying_metadata(bdev, map->m_pblk + i); + } + return 0; +} + +/* + * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length + * mpd->len and submit pages underlying it for IO * - * We will move the buffer onto the async_data list, but *after* it has - * been dirtied. So there's a small window where we have dirty data on - * BJ_Metadata. 
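As the mapped extent is applied in mpage_map_and_submit_buffers(), each delayed buffer inside the range picks up consecutive physical blocks (b_blocknr = pblock++), while buffers outside it are left alone. A toy run of that assignment pattern:

#include <stdio.h>

int main(void)
{
	/* toy version of the buffer walk: the extent maps logical blocks
	 * 4..7 to physical blocks 1000..1003 */
	unsigned long m_lblk = 4, m_len = 4, pblock = 1000;

	for (unsigned long lblk = 0; lblk < 8; lblk++) {
		if (lblk < m_lblk || lblk >= m_lblk + m_len) {
			printf("lblk %lu: outside extent, skipped\n", lblk);
			continue;
		}
		printf("lblk %lu -> pblk %lu\n", lblk, pblock++);
	}
	return 0;
}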
+ * @handle - handle for journal operations + * @mpd - extent to map + * @give_up_on_write - we set this to true iff there is a fatal error and there + * is no hope of writing the data. The caller should discard + * dirty pages to avoid infinite loops. * - * Note that this only applies to the last partial page in the file. The - * bit which block_write_full_page() uses prepare/commit for. (That's - * broken code anyway: it's wrong for msync()). + * The function maps extent starting at mpd->lblk of length mpd->len. If it is + * delayed, blocks are allocated, if it is unwritten, we may need to convert + * them to initialized or split the described range from larger unwritten + * extent. Note that we need not map all the described range since allocation + * can return less blocks or the range is covered by more unwritten extents. We + * cannot map more because we are limited by reserved transaction credits. On + * the other hand we always make sure that the last touched page is fully + * mapped so that it can be written out (and thus forward progress is + * guaranteed). After mapping we submit all mapped pages for IO. + */ +static int mpage_map_and_submit_extent(handle_t *handle, + struct mpage_da_data *mpd, + bool *give_up_on_write) +{ + struct inode *inode = mpd->inode; + struct ext4_map_blocks *map = &mpd->map; + int err; + loff_t disksize; + + mpd->io_submit.io_end->offset = + ((loff_t)map->m_lblk) << inode->i_blkbits; + do { + err = mpage_map_one_extent(handle, mpd); + if (err < 0) { + struct super_block *sb = inode->i_sb; + + if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + goto invalidate_dirty_pages; + /* + * Let the uper layers retry transient errors. + * In the case of ENOSPC, if ext4_count_free_blocks() + * is non-zero, a commit should free up blocks. + */ + if ((err == -ENOMEM) || + (err == -ENOSPC && ext4_count_free_clusters(sb))) + return err; + ext4_msg(sb, KERN_CRIT, + "Delayed block allocation failed for " + "inode %lu at logical offset %llu with" + " max blocks %u with error %d", + inode->i_ino, + (unsigned long long)map->m_lblk, + (unsigned)map->m_len, -err); + ext4_msg(sb, KERN_CRIT, + "This should not happen!! Data will " + "be lost\n"); + if (err == -ENOSPC) + ext4_print_free_blocks(inode); + invalidate_dirty_pages: + *give_up_on_write = true; + return err; + } + /* + * Update buffer state, submit mapped pages, and get us new + * extent to map + */ + err = mpage_map_and_submit_buffers(mpd); + if (err < 0) + return err; + } while (map->m_len); + + /* + * Update on-disk size after IO is submitted. Races with + * truncate are avoided by checking i_size under i_data_sem. + */ + disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; + if (disksize > EXT4_I(inode)->i_disksize) { + int err2; + loff_t i_size; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = i_size_read(inode); + if (disksize > i_size) + disksize = i_size; + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; + err2 = ext4_mark_inode_dirty(handle, inode); + up_write(&EXT4_I(inode)->i_data_sem); + if (err2) + ext4_error(inode->i_sb, + "Failed to mark inode %lu dirty", + inode->i_ino); + if (!err) + err = err2; + } + return err; +} + +/* + * Calculate the total number of credits to reserve for one writepages + * iteration. This is called from ext4_writepages(). We map an extent of + * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping + * the last partial page. 
So in total we can map MAX_WRITEPAGES_EXTENT_LEN + + * bpp - 1 blocks in bpp different extents. + */ +static int ext4_da_writepages_trans_blocks(struct inode *inode) +{ + int bpp = ext4_journal_blocks_per_page(inode); + + return ext4_meta_trans_blocks(inode, + MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); +} + +/* + * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages + * and underlying extent to map * - * It's a rare case: affects the final partial page, for journalled data - * where the file is subject to bith write() and writepage() in the same - * transction. To fix it we'll need a custom block_write_full_page(). - * We'll probably need that anyway for journalling writepage() output. + * @mpd - where to look for pages * - * We don't honour synchronous mounts for writepage(). That would be - * disastrous. Any write() or metadata operation will sync the fs for - * us. + * Walk dirty pages in the mapping. If they are fully mapped, submit them for + * IO immediately. When we find a page which isn't mapped we start accumulating + * extent of buffers underlying these pages that needs mapping (formed by + * either delayed or unwritten buffers). We also lock the pages containing + * these buffers. The extent found is returned in @mpd structure (starting at + * mpd->lblk with length mpd->len blocks). * - * AKPM2: if all the page's buffers are mapped to disk and !data=journal, - * we don't need to open a transaction here. + * Note that this function can attach bios to one io_end structure which are + * neither logically nor physically contiguous. Although it may seem as an + * unnecessary complication, it is actually inevitable in blocksize < pagesize + * case as we need to track IO to all buffers underlying a page in one io_end. */ -static int ext4_ordered_writepage(struct page *page, - struct writeback_control *wbc) +static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) { - struct inode *inode = page->mapping->host; - struct buffer_head *page_bufs; + struct address_space *mapping = mpd->inode->i_mapping; + struct pagevec pvec; + unsigned int nr_pages; + long left = mpd->wbc->nr_to_write; + pgoff_t index = mpd->first_page; + pgoff_t end = mpd->last_page; + int tag; + int i, err = 0; + int blkbits = mpd->inode->i_blkbits; + ext4_lblk_t lblk; + struct buffer_head *head; + + if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; + + pagevec_init(&pvec, 0); + mpd->map.m_len = 0; + mpd->next_page = index; + while (index <= end) { + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + goto out; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), or + * even swizzled back from swapper_space to tmpfs file + * mapping. However, page->index will not change + * because we have a reference on the page. + */ + if (page->index > end) + goto out; + + /* + * Accumulated enough dirty pages? This doesn't apply + * to WB_SYNC_ALL mode. For integrity sync we have to + * keep going because someone may be concurrently + * dirtying pages, and we might have synced a lot of + * newly appeared dirty pages, but have not synced all + * of the old dirty pages. + */ + if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) + goto out; + + /* If we can't merge this page, we are done. 
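One writepages iteration therefore reserves credits for a map request of MAX_WRITEPAGES_EXTENT_LEN + bpp - 1 blocks, where bpp is blocks per page; the actual credit count then comes from ext4_meta_trans_blocks(), which is not reproduced here. A small calculator for the request size (4K pages assumed):

#include <stdio.h>

#define MAX_WRITEPAGES_EXTENT_LEN 2048
#define PAGE_SHIFT 12	/* 4096-byte pages assumed */

static int blocks_per_page(unsigned blkbits)
{
	return 1 << (PAGE_SHIFT - blkbits);
}

int main(void)
{
	for (unsigned blkbits = 10; blkbits <= 12; blkbits++) {
		int bpp = blocks_per_page(blkbits);

		printf("blkbits=%u bpp=%d -> map request %d blocks\n",
		       blkbits, bpp, MAX_WRITEPAGES_EXTENT_LEN + bpp - 1);
	}
	return 0;
}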
*/ + if (mpd->map.m_len > 0 && mpd->next_page != page->index) + goto out; + + lock_page(page); + /* + * If the page is no longer dirty, or its mapping no + * longer corresponds to inode we are writing (which + * means it has been truncated or invalidated), or the + * page is already under writeback and we are not doing + * a data integrity writeback, skip the page + */ + if (!PageDirty(page) || + (PageWriteback(page) && + (mpd->wbc->sync_mode == WB_SYNC_NONE)) || + unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + wait_on_page_writeback(page); + BUG_ON(PageWriteback(page)); + + if (mpd->map.m_len == 0) + mpd->first_page = page->index; + mpd->next_page = page->index + 1; + /* Add all dirty buffers to mpd */ + lblk = ((ext4_lblk_t)page->index) << + (PAGE_CACHE_SHIFT - blkbits); + head = page_buffers(page); + err = mpage_process_page_bufs(mpd, head, head, lblk); + if (err <= 0) + goto out; + err = 0; + left--; + } + pagevec_release(&pvec); + cond_resched(); + } + return 0; +out: + pagevec_release(&pvec); + return err; +} + +static int __writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret = ext4_writepage(page, wbc); + mapping_set_error(mapping, ret); + return ret; +} + +static int ext4_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + pgoff_t writeback_index = 0; + long nr_to_write = wbc->nr_to_write; + int range_whole = 0; + int cycled = 1; handle_t *handle = NULL; - int ret = 0; - int err; + struct mpage_da_data mpd; + struct inode *inode = mapping->host; + int needed_blocks, rsv_blocks = 0, ret = 0; + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + bool done; + struct blk_plug plug; + bool give_up_on_write = false; - J_ASSERT(PageLocked(page)); + trace_ext4_writepages(inode, wbc); /* - * We give up here if we're reentered, because it might be for a - * different filesystem. + * No pages to write? This is mainly a kludge to avoid starting + * a transaction for special inodes like journal inode on last iput() + * because that could violate lock ordering on umount */ - if (ext4_journal_current_handle()) - goto out_fail; + if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + goto out_writepages; - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + if (ext4_should_journal_data(inode)) { + struct blk_plug plug; - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_fail; + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, __writepage, mapping); + blk_finish_plug(&plug); + goto out_writepages; } - if (!page_has_buffers(page)) { - create_empty_buffers(page, inode->i_sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); + /* + * If the filesystem has aborted, it is read-only, so return + * right away instead of dumping stack traces later on that + * will obscure the real source of the problem. We test + * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because + * the latter could be true if the filesystem is mounted + * read-only, and in that case, ext4_writepages should + * *never* be called, so if that ever happens, we would want + * the stack trace. 
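The skip test in the page scan above combines three conditions: the page is no longer dirty, it is under writeback during a non-integrity sync, or its mapping changed under us (a truncate raced in). As a stand-alone predicate (toy page type; integrity_sync corresponds to WB_SYNC_ALL):

#include <stdio.h>
#include <stdbool.h>

struct pg { bool dirty, writeback, mapping_ok; };

/* skip rule from mpage_prepare_extent_to_map() */
static bool skip_page(const struct pg *p, bool integrity_sync)
{
	return !p->dirty ||
	       (p->writeback && !integrity_sync) ||
	       !p->mapping_ok;
}

int main(void)
{
	struct pg busy = { .dirty = true, .writeback = true, .mapping_ok = true };

	printf("WB_SYNC_NONE: skip=%d\n", skip_page(&busy, false)); /* 1: don't wait */
	printf("WB_SYNC_ALL:  skip=%d\n", skip_page(&busy, true));  /* 0: wait, then write */
	return 0;
}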
+ */ + if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) { + ret = -EROFS; + goto out_writepages; } - page_bufs = page_buffers(page); - walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, bget_one); - ret = block_write_full_page(page, ext4_get_block, wbc); + if (ext4_should_dioread_nolock(inode)) { + /* + * We may need to convert up to one extent per block in + * the page and we may dirty the inode. + */ + rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits); + } /* - * The page can become unlocked at any point now, and - * truncate can then come in and change things. So we - * can't touch *page from now on. But *page_bufs is - * safe due to elevated refcount. + * If we have inline data and arrive here, it means that + * we will soon create the block for the 1st page, so + * we'd better clear the inline data here. */ + if (ext4_has_inline_data(inode)) { + /* Just inode will be modified... */ + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_writepages; + } + BUG_ON(ext4_test_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA)); + ext4_destroy_inline_data(handle, inode); + ext4_journal_stop(handle); + } + + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + + if (wbc->range_cyclic) { + writeback_index = mapping->writeback_index; + if (writeback_index) + cycled = 0; + mpd.first_page = writeback_index; + mpd.last_page = -1; + } else { + mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT; + mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT; + } + + mpd.inode = inode; + mpd.wbc = wbc; + ext4_io_submit_init(&mpd.io_submit, wbc); +retry: + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); + done = false; + blk_start_plug(&plug); + while (!done && mpd.first_page <= mpd.last_page) { + /* For each extent of pages we use new io_end */ + mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!mpd.io_submit.io_end) { + ret = -ENOMEM; + break; + } + + /* + * We have two constraints: We find one extent to map and we + * must always write out whole page (makes a difference when + * blocksize < pagesize) so that we don't block on IO when we + * try to write out the rest of the page. Journalled mode is + * not supported by delalloc. + */ + BUG_ON(ext4_should_journal_data(inode)); + needed_blocks = ext4_da_writepages_trans_blocks(inode); + + /* start a new transaction */ + handle = ext4_journal_start_with_reserve(inode, + EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " + "%ld pages, ino %lu; err %d", __func__, + wbc->nr_to_write, inode->i_ino, ret); + /* Release allocated io_end */ + ext4_put_io_end(mpd.io_submit.io_end); + break; + } + + trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); + ret = mpage_prepare_extent_to_map(&mpd); + if (!ret) { + if (mpd.map.m_len) + ret = mpage_map_and_submit_extent(handle, &mpd, + &give_up_on_write); + else { + /* + * We scanned the whole range (or exhausted + * nr_to_write), submitted what was mapped and + * didn't find anything needing mapping. We are + * done. 
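For range_cyclic writeback the loop above makes at most two passes: from writeback_index to the end of the range, then a wrap-around pass covering [0, writeback_index). Just that control flow, ignoring the nr_to_write and error bookkeeping:

#include <stdio.h>

int main(void)
{
	unsigned long writeback_index = 7, last = 10;	/* toy values */
	unsigned long first = writeback_index, end = last;
	int cycled = (writeback_index == 0);

retry:
	printf("writing pages %lu..%lu\n", first, end);
	if (!cycled) {			/* second pass covers the wrapped range */
		cycled = 1;
		end = writeback_index - 1;
		first = 0;
		goto retry;
	}
	return 0;
}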
+ */ + done = true; + } + } + ext4_journal_stop(handle); + /* Submit prepared bio */ + ext4_io_submit(&mpd.io_submit); + /* Unlock pages we didn't use */ + mpage_release_unused_pages(&mpd, give_up_on_write); + /* Drop our io_end reference we got from init */ + ext4_put_io_end(mpd.io_submit.io_end); + + if (ret == -ENOSPC && sbi->s_journal) { + /* + * Commit the transaction which would + * free blocks released in the transaction + * and try again + */ + jbd2_journal_force_commit_nested(sbi->s_journal); + ret = 0; + continue; + } + /* Fatal error - ENOMEM, EIO... */ + if (ret) + break; + } + blk_finish_plug(&plug); + if (!ret && !cycled && wbc->nr_to_write > 0) { + cycled = 1; + mpd.last_page = writeback_index - 1; + mpd.first_page = 0; + goto retry; + } + + /* Update index */ + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + /* + * Set the writeback_index so that range_cyclic + * mode will write it back later + */ + mapping->writeback_index = mpd.first_page; + +out_writepages: + trace_ext4_writepages_result(inode, wbc, ret, + nr_to_write - wbc->nr_to_write); + return ret; +} + +static int ext4_nonda_switch(struct super_block *sb) +{ + s64 free_clusters, dirty_clusters; + struct ext4_sb_info *sbi = EXT4_SB(sb); /* - * And attach them to the current transaction. But only if - * block_write_full_page() succeeded. Otherwise they are unmapped, - * and generally junk. + * switch to non delalloc mode if we are running low + * on free block. The free block accounting via percpu + * counters can get slightly wrong with percpu_counter_batch getting + * accumulated on each CPU without updating global counters + * Delalloc need an accurate free block accounting. So switch + * to non delalloc when we are near to error range. */ - if (ret == 0) { - err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, - NULL, jbd2_journal_dirty_data_fn); - if (!ret) - ret = err; + free_clusters = + percpu_counter_read_positive(&sbi->s_freeclusters_counter); + dirty_clusters = + percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); + /* + * Start pushing delalloc when 1/2 of free blocks are dirty. 
+ */ + if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) + try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); + + if (2 * free_clusters < 3 * dirty_clusters || + free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { + /* + * free block count is less than 150% of dirty blocks + * or free blocks is less than watermark + */ + return 1; } - walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, bput_one); - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - return ret; + return 0; +} -out_fail: - redirty_page_for_writepage(wbc, page); +static int ext4_da_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret, retries = 0; + struct page *page; + pgoff_t index; + struct inode *inode = mapping->host; + handle_t *handle; + + index = pos >> PAGE_CACHE_SHIFT; + + if (ext4_nonda_switch(inode->i_sb)) { + *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; + return ext4_write_begin(file, mapping, pos, + len, flags, pagep, fsdata); + } + *fsdata = (void *)0; + trace_ext4_da_write_begin(inode, pos, len, flags); + + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + ret = ext4_da_write_inline_data_begin(mapping, inode, + pos, len, flags, + pagep, fsdata); + if (ret < 0) + return ret; + if (ret == 1) + return 0; + } + + /* + * grab_cache_page_write_begin() can take a long time if the + * system is thrashing due to memory pressure, or if the page + * is being written back. So grab it first before we start + * the transaction handle. This also allows us to allocate + * the page (if needed) without using GFP_NOFS. + */ +retry_grab: + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; unlock_page(page); + + /* + * With delayed allocation, we don't log the i_disksize update + * if there is delayed block allocation. But we still need + * to journalling the i_disksize update if writes to the end + * of file which has an already mapped buffer. + */ +retry_journal: + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1); + if (IS_ERR(handle)) { + page_cache_release(page); + return PTR_ERR(handle); + } + + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + unlock_page(page); + page_cache_release(page); + ext4_journal_stop(handle); + goto retry_grab; + } + /* In case writeback began while the page was unlocked */ + wait_for_stable_page(page); + + ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); + if (ret < 0) { + unlock_page(page); + ext4_journal_stop(handle); + /* + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. 
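The thresholds in ext4_nonda_switch() a little further up work out to: kick background writeback once dirty clusters exceed half of free, and fall back to non-delalloc once free drops below 150% of dirty or below the watermark. Tabulated with a toy watermark (EXT4_FREECLUSTERS_WATERMARK's real value is CPU-count dependent):

#include <stdio.h>
#include <stdbool.h>

#define WATERMARK 100	/* stand-in for EXT4_FREECLUSTERS_WATERMARK */

static bool nonda_switch(long long free, long long dirty)
{
	return 2 * free < 3 * dirty || free < dirty + WATERMARK;
}

int main(void)
{
	long long cases[][2] = { {1000, 100}, {1000, 600}, {1000, 700}, {100, 10} };

	for (int i = 0; i < 4; i++)
		printf("free=%lld dirty=%lld -> %s\n", cases[i][0], cases[i][1],
		       nonda_switch(cases[i][0], cases[i][1]) ?
		       "non-delalloc" : "delalloc");
	return 0;
}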
+ */ + if (pos + len > inode->i_size) + ext4_truncate_failed_write(inode); + + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; + + page_cache_release(page); + return ret; + } + + *pagep = page; return ret; } -static int ext4_writeback_writepage(struct page *page, - struct writeback_control *wbc) +/* + * Check if we should update i_disksize + * when write to the end of file but not require block allocation + */ +static int ext4_da_should_update_i_disksize(struct page *page, + unsigned long offset) { + struct buffer_head *bh; struct inode *inode = page->mapping->host; - handle_t *handle = NULL; - int ret = 0; - int err; + unsigned int idx; + int i; - if (ext4_journal_current_handle()) - goto out_fail; + bh = page_buffers(page); + idx = offset >> inode->i_blkbits; - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_fail; + for (i = 0; i < idx; i++) + bh = bh->b_this_page; + + if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) + return 0; + return 1; +} + +static int ext4_da_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + int ret = 0, ret2; + handle_t *handle = ext4_journal_current_handle(); + loff_t new_i_size; + unsigned long start, end; + int write_mode = (int)(unsigned long)fsdata; + + if (write_mode == FALL_BACK_TO_NONDELALLOC) + return ext4_write_end(file, mapping, pos, + len, copied, page, fsdata); + + trace_ext4_da_write_end(inode, pos, len, copied); + start = pos & (PAGE_CACHE_SIZE - 1); + end = start + copied - 1; + + /* + * generic_write_end() will run mark_inode_dirty() if i_size + * changes. So let's piggyback the i_disksize mark_inode_dirty + * into that. + */ + new_i_size = pos + copied; + if (copied && new_i_size > EXT4_I(inode)->i_disksize) { + if (ext4_has_inline_data(inode) || + ext4_da_should_update_i_disksize(page, end)) { + down_write(&EXT4_I(inode)->i_data_sem); + if (new_i_size > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = new_i_size; + up_write(&EXT4_I(inode)->i_data_sem); + /* We need to mark inode dirty even if + * new_i_size is less that inode->i_size + * bu greater than i_disksize.(hint delalloc) + */ + ext4_mark_inode_dirty(handle, inode); + } } - if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) - ret = nobh_writepage(page, ext4_get_block, wbc); + if (write_mode != CONVERT_INLINE_DATA && + ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && + ext4_has_inline_data(inode)) + ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied, + page); else - ret = block_write_full_page(page, ext4_get_block, wbc); + ret2 = generic_write_end(file, mapping, pos, len, copied, + page, fsdata); - err = ext4_journal_stop(handle); + copied = ret2; + if (ret2 < 0) + ret = ret2; + ret2 = ext4_journal_stop(handle); if (!ret) - ret = err; - return ret; + ret = ret2; -out_fail: - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return ret; + return ret ? 
ret : copied; } -static int ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc) +static void ext4_da_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { - struct inode *inode = page->mapping->host; - handle_t *handle = NULL; - int ret = 0; - int err; + /* + * Drop reserved blocks + */ + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + goto out; - if (ext4_journal_current_handle()) - goto no_write; + ext4_da_page_release_reservation(page, offset, length); - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto no_write; - } +out: + ext4_invalidatepage(page, offset, length); + + return; +} + +/* + * Force all delayed allocation blocks to be allocated for a given inode. + */ +int ext4_alloc_da_blocks(struct inode *inode) +{ + trace_ext4_alloc_da_blocks(inode); + + if (!EXT4_I(inode)->i_reserved_data_blocks && + !EXT4_I(inode)->i_reserved_meta_blocks) + return 0; + + /* + * We do something simple for now. The filemap_flush() will + * also start triggering a write of the data blocks, which is + * not strictly speaking necessary (and for users of + * laptop_mode, not even desirable). However, to do otherwise + * would require replicating code paths in: + * + * ext4_writepages() -> + * write_cache_pages() ---> (via passed in callback function) + * __mpage_da_writepage() --> + * mpage_add_bh_to_extent() + * mpage_da_map_blocks() + * + * The problem is that write_cache_pages(), located in + * mm/page-writeback.c, marks pages clean in preparation for + * doing I/O, which is not desirable if we're not planning on + * doing I/O at all. + * + * We could call write_cache_pages(), and then redirty all of + * the pages by calling redirty_page_for_writepage() but that + * would be ugly in the extreme. So instead we would need to + * replicate parts of the code in the above functions, + * simplifying them because we wouldn't actually intend to + * write out the pages, but rather only collect contiguous + * logical block extents, call the multi-block allocator, and + * then update the buffer heads with the block allocations. + * + * For now, though, we'll cheat by calling filemap_flush(), + * which will map the blocks, and start the I/O, but not + * actually wait for the I/O to complete. + */ + return filemap_flush(inode->i_mapping); +} - if (!page_has_buffers(page) || PageChecked(page)) { +/* + * bmap() is special. It gets used by applications such as lilo and by + * the swapper to find the on-disk block of a specific piece of data. + * + * Naturally, this is dangerous if the block concerned is still in the + * journal. If somebody makes a swapfile on an ext4 data-journaling + * filesystem and enables swap, then they may get a nasty shock when the + * data getting swapped to that swapfile suddenly gets overwritten by + * the original zero's written out previously to the journal and + * awaiting writeback in the kernel's buffer cache. + * + * So, if we see any bmap calls here on a modified, data-journaled file, + * take extra steps to flush any blocks which might be in the cache. + */ +static sector_t ext4_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + journal_t *journal; + int err; + + /* + * We can get here for an inline file via the FIBMAP ioctl + */ + if (ext4_has_inline_data(inode)) + return 0; + + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && + test_opt(inode->i_sb, DELALLOC)) { /* - * It's mmapped pagecache. 
Add buffers and journal it. There - * doesn't seem much point in redirtying the page here. + * With delalloc we want to sync the file + * so that we can make sure we allocate + * blocks for file */ - ClearPageChecked(page); - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext4_get_block); - if (ret != 0) { - ext4_journal_stop(handle); - goto out_unlock; - } - ret = walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); + filemap_write_and_wait(mapping); + } - err = walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, write_end_fn); - if (ret == 0) - ret = err; - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; - unlock_page(page); - } else { + if (EXT4_JOURNAL(inode) && + ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { /* - * It may be a page full of checkpoint-mode buffers. We don't - * really know unless we go poke around in the buffer_heads. - * But block_write_full_page will do the right thing. + * This is a REALLY heavyweight approach, but the use of + * bmap on dirty files is expected to be extremely rare: + * only if we run lilo or swapon on a freshly made file + * do we expect this to happen. + * + * (bmap requires CAP_SYS_RAWIO so this does not + * represent an unprivileged user DOS attack --- we'd be + * in trouble if mortal users could trigger this path at + * will.) + * + * NB. EXT4_STATE_JDATA is not set on files other than + * regular files. If somebody wants to bmap a directory + * or symlink and gets confused because the buffer + * hasn't yet been flushed to disk, they deserve + * everything they get. */ - ret = block_write_full_page(page, ext4_get_block, wbc); + + ext4_clear_inode_state(inode, EXT4_STATE_JDATA); + journal = EXT4_JOURNAL(inode); + jbd2_journal_lock_updates(journal); + err = jbd2_journal_flush(journal); + jbd2_journal_unlock_updates(journal); + + if (err) + return 0; } - err = ext4_journal_stop(handle); - if (!ret) - ret = err; -out: - return ret; -no_write: - redirty_page_for_writepage(wbc, page); -out_unlock: - unlock_page(page); - goto out; + return generic_block_bmap(mapping, block, ext4_get_block); } static int ext4_readpage(struct file *file, struct page *page) { - return mpage_readpage(page, ext4_get_block); + int ret = -EAGAIN; + struct inode *inode = page->mapping->host; + + trace_ext4_readpage(page); + + if (ext4_has_inline_data(inode)) + ret = ext4_readpage_inline(inode, page); + + if (ret == -EAGAIN) + return mpage_readpage(page, ext4_get_block); + + return ret; } static int ext4_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { + struct inode *inode = mapping->host; + + /* If the file has inline data, no need to do readpages. 
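[Editor's aside — illustrative sketch, not part of this patch. The
inline-data and journal-flush short-circuits in ext4_bmap() above can be
observed from user space through the FIBMAP ioctl; the file name below is
arbitrary, and FIBMAP needs CAP_SYS_RAWIO, as the comment above notes.]

	/* fibmap.c — ask for the physical block backing logical block 0.
	 * ext4_bmap() returns 0 for inline-data files, and flushes the
	 * journal first for dirty data-journaled files. */
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>		/* FIBMAP */

	int main(int argc, char **argv)
	{
		int fd, block = 0;	/* logical block in, physical block out */

		if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
			return 1;
		if (ioctl(fd, FIBMAP, &block) < 0) {
			perror("FIBMAP");
			return 1;
		}
		/* 0 here can mean a hole -- or an inline-data file */
		printf("logical block 0 -> physical block %d\n", block);
		return close(fd);
	}

[End of aside.]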
 */
+	if (ext4_has_inline_data(inode))
+		return 0;
+
 	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
 }
 
-static void ext4_invalidatepage(struct page *page, unsigned long offset)
+static void ext4_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
+{
+	trace_ext4_invalidatepage(page, offset, length);
+
+	/* No journalling happens on data buffers when this function is used */
+	WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
+
+	block_invalidatepage(page, offset, length);
+}
+
+static int __ext4_journalled_invalidatepage(struct page *page,
+					    unsigned int offset,
+					    unsigned int length)
 {
 	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
 
+	trace_ext4_journalled_invalidatepage(page, offset, length);
+
 	/*
 	 * If it's a full truncate we just forget about the pending dirtying
 	 */
-	if (offset == 0)
+	if (offset == 0 && length == PAGE_CACHE_SIZE)
 		ClearPageChecked(page);
 
-	jbd2_journal_invalidatepage(journal, page, offset);
+	return jbd2_journal_invalidatepage(journal, page, offset, length);
+}
+
+/* Wrapper for aops... */
+static void ext4_journalled_invalidatepage(struct page *page,
+					   unsigned int offset,
+					   unsigned int length)
+{
+	WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
 }
 
 static int ext4_releasepage(struct page *page, gfp_t wait)
 {
 	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
 
-	WARN_ON(PageChecked(page));
-	if (!page_has_buffers(page))
+	trace_ext4_releasepage(page);
+
+	/* Page has dirty journalled data -> cannot release */
+	if (PageChecked(page))
 		return 0;
-	return jbd2_journal_try_to_free_buffers(journal, page, wait);
+	if (journal)
+		return jbd2_journal_try_to_free_buffers(journal, page, wait);
+	else
+		return try_to_free_buffers(page);
+}
+
+/*
+ * ext4_get_block used when preparing for a DIO write or buffer write.
+ * We allocate an uninitialized extent if blocks haven't been allocated.
+ * The extent will be converted to initialized after the IO is complete.
+ */
+int ext4_get_block_write(struct inode *inode, sector_t iblock,
+			 struct buffer_head *bh_result, int create)
+{
+	ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
+		   inode->i_ino, create);
+	return _ext4_get_block(inode, iblock, bh_result,
+			       EXT4_GET_BLOCKS_IO_CREATE_EXT);
+}
+
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
+				       struct buffer_head *bh_result, int create)
+{
+	ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
+		   inode->i_ino, create);
+	return _ext4_get_block(inode, iblock, bh_result,
+			       EXT4_GET_BLOCKS_NO_LOCK);
+}
+
+static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+			    ssize_t size, void *private)
+{
+	ext4_io_end_t *io_end = iocb->private;
+
+	/* if not async direct IO just return */
+	if (!io_end)
+		return;
+
+	ext_debug("ext4_end_io_dio(): io_end 0x%p "
+		  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
+		  iocb->private, io_end->inode->i_ino, iocb, offset,
+		  size);
+
+	iocb->private = NULL;
+	io_end->offset = offset;
+	io_end->size = size;
+	ext4_put_io_end(io_end);
 }
 
 /*
+ * For ext4 extent files, ext4 will do direct-io writes to holes, to
+ * preallocated extents, and to writes that extend the file, with no
+ * need to fall back to buffered IO.
+ *
+ * For holes, we fallocate those blocks and mark them as unwritten.
+ * If the blocks were preallocated, we make sure they are split, but
+ * still keep the range to write as unwritten.
+ *
+ * The unwritten extents will be converted to written when DIO is completed.
+ * For async direct IO, the IO may still be pending when we return, so we
+ * set up an end_io callback function which will do the conversion
+ * when the async direct IO is completed.
+ *
  * If the O_DIRECT write will extend the file then add this inode to the
  * orphan list.  So recovery will truncate it back to the original size
  * if the machine crashes during the write.
  *
- * If the O_DIRECT write is intantiating holes inside i_size and the machine
- * crashes then stale disk data _may_ be exposed inside the file. But current
- * VFS code falls back into buffered path in that case so we are safe.
 */
-static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
-			const struct iovec *iov, loff_t offset,
-			unsigned long nr_segs)
+static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
+				  struct iov_iter *iter, loff_t offset)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	handle_t *handle;
 	ssize_t ret;
-	int orphan = 0;
-	size_t count = iov_length(iov, nr_segs);
+	size_t count = iov_iter_count(iter);
+	int overwrite = 0;
+	get_block_t *get_block_func = NULL;
+	int dio_flags = 0;
+	loff_t final_size = offset + count;
+	ext4_io_end_t *io_end = NULL;
 
-	if (rw == WRITE) {
-		loff_t final_size = offset + count;
+	/* Use the old path for reads and writes beyond i_size. */
+	if (rw != WRITE || final_size > inode->i_size)
+		return ext4_ind_direct_IO(rw, iocb, iter, offset);
 
-		if (final_size > inode->i_size) {
-			/* Credits for sb + inode write */
-			handle = ext4_journal_start(inode, 2);
-			if (IS_ERR(handle)) {
-				ret = PTR_ERR(handle);
-				goto out;
-			}
-			ret = ext4_orphan_add(handle, inode);
-			if (ret) {
-				ext4_journal_stop(handle);
-				goto out;
-			}
-			orphan = 1;
-			ei->i_disksize = inode->i_size;
-			ext4_journal_stop(handle);
-		}
-	}
+	BUG_ON(iocb->private == NULL);
 
-	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				 offset, nr_segs,
-				 ext4_get_block, NULL);
+	/*
+	 * Make all waiters for direct IO properly wait also for extent
+	 * conversion.  This also disallows a race between truncate() and
+	 * overwrite DIO, as i_dio_count needs to be incremented under i_mutex.
+	 */
+	if (rw == WRITE)
+		atomic_inc(&inode->i_dio_count);
 
-	if (orphan) {
-		int err;
+	/* If we do an overwrite dio, i_mutex locking can be released */
+	overwrite = *((int *)iocb->private);
 
-		/* Credits for sb + inode write */
-		handle = ext4_journal_start(inode, 2);
-		if (IS_ERR(handle)) {
-			/* This is really bad luck. We've written the data
-			 * but cannot extend i_size. Bail out and pretend
-			 * the write failed... */
-			ret = PTR_ERR(handle);
-			goto out;
+	if (overwrite) {
+		down_read(&EXT4_I(inode)->i_data_sem);
+		mutex_unlock(&inode->i_mutex);
+	}
+
+	/*
+	 * We could direct write to holes and fallocate.
+	 *
+	 * Allocated blocks to fill the hole are marked as unwritten
+	 * to prevent a parallel buffered read from exposing the stale
+	 * data before the DIO completes the data IO.
+	 *
+	 * As to previously fallocated extents, ext4's get_block will
+	 * simply mark the buffer mapped but still keep the extents
+	 * unwritten.
+	 *
+	 * For the non-AIO case, we will convert those unwritten extents
+	 * to written after returning from blockdev_direct_IO.
+	 *
+	 * For async DIO, the conversion needs to be deferred until the
+	 * IO is completed.  The ext4 end_io callback function will be
+	 * called to take care of the conversion work.  Here, for the
+	 * async case, we allocate an io_end structure to hook to the iocb.
+	 */
+	iocb->private = NULL;
+	ext4_inode_aio_set(inode, NULL);
+	if (!is_sync_kiocb(iocb)) {
+		io_end = ext4_init_io_end(inode, GFP_NOFS);
+		if (!io_end) {
+			ret = -ENOMEM;
+			goto retake_lock;
 		}
-		if (inode->i_nlink)
-			ext4_orphan_del(handle, inode);
-		if (ret > 0) {
-			loff_t end = offset + ret;
-			if (end > inode->i_size) {
-				ei->i_disksize = end;
-				i_size_write(inode, end);
-				/*
-				 * We're going to return a positive `ret'
-				 * here due to non-zero-length I/O, so there's
-				 * no way of reporting error returns from
-				 * ext4_mark_inode_dirty() to userspace.  So
-				 * ignore it.
-				 */
-				ext4_mark_inode_dirty(handle, inode);
-			}
+		/*
+		 * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
+		 */
+		iocb->private = ext4_get_io_end(io_end);
+		/*
+		 * We save the io structure for the current async direct
+		 * IO, so that later ext4_map_blocks() can flag the io
+		 * structure if there are unwritten extents that need to
+		 * be converted when the IO is completed.
+		 */
+		ext4_inode_aio_set(inode, io_end);
+	}
+
+	if (overwrite) {
+		get_block_func = ext4_get_block_write_nolock;
+	} else {
+		get_block_func = ext4_get_block_write;
+		dio_flags = DIO_LOCKING;
+	}
+	ret = __blockdev_direct_IO(rw, iocb, inode,
+				   inode->i_sb->s_bdev, iter,
+				   offset,
+				   get_block_func,
+				   ext4_end_io_dio,
+				   NULL,
+				   dio_flags);
+
+	/*
+	 * Put our reference to io_end. This can free the io_end structure e.g.
+	 * in sync IO case or in case of error. It can even perform extent
+	 * conversion if all bios we submitted finished before we got here.
+	 * Note that in that case iocb->private can be already set to NULL
+	 * here.
+	 */
+	if (io_end) {
+		ext4_inode_aio_set(inode, NULL);
+		ext4_put_io_end(io_end);
+		/*
+		 * When no IO was submitted ext4_end_io_dio() was not
+		 * called so we have to put iocb's reference.
+		 */
+		if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
+			WARN_ON(iocb->private != io_end);
+			WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
+			ext4_put_io_end(io_end);
+			iocb->private = NULL;
 		}
-		err = ext4_journal_stop(handle);
-		if (ret == 0)
+	}
+	if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
+						EXT4_STATE_DIO_UNWRITTEN)) {
+		int err;
+		/*
+		 * For the non-AIO case the IO is already completed,
+		 * so we can do the conversion right here.
+		 */
+		err = ext4_convert_unwritten_extents(NULL, inode,
+						     offset, ret);
+		if (err < 0)
 			ret = err;
+		ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
 	}
-out:
+
+retake_lock:
+	if (rw == WRITE)
+		inode_dio_done(inode);
+	/* take i_mutex locking again if we did an overwrite dio */
+	if (overwrite) {
+		up_read(&EXT4_I(inode)->i_data_sem);
+		mutex_lock(&inode->i_mutex);
+	}
+
+	return ret;
+}
+
+static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+			      struct iov_iter *iter, loff_t offset)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	size_t count = iov_iter_count(iter);
+	ssize_t ret;
+
+	/*
+	 * If we are doing data journalling we don't support O_DIRECT
+	 */
+	if (ext4_should_journal_data(inode))
+		return 0;
+
+	/* Let buffer I/O handle the inline data case.
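[Editor's aside — illustrative sketch, not part of this patch. Returning 0
from ->direct_IO, as ext4_direct_IO() does above for data-journaled and
inline-data inodes, makes the VFS fall back to buffered I/O. A user-space
caller only has to honour the usual O_DIRECT alignment rules; the file name
and the 4096-byte alignment below are assumptions for the example.]

	#define _GNU_SOURCE		/* O_DIRECT */
	#include <fcntl.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		void *buf;
		int fd = open("testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);

		/* O_DIRECT wants buffer, offset and length aligned */
		if (fd < 0 || posix_memalign(&buf, 4096, 4096))
			return 1;
		memset(buf, 'x', 4096);
		if (write(fd, buf, 4096) != 4096)	/* DIO, or buffered fallback */
			return 1;
		free(buf);
		return close(fd);
	}

[End of aside.]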
*/ + if (ext4_has_inline_data(inode)) + return 0; + + trace_ext4_direct_IO_enter(inode, offset, count, rw); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ret = ext4_ext_direct_IO(rw, iocb, iter, offset); + else + ret = ext4_ind_direct_IO(rw, iocb, iter, offset); + trace_ext4_direct_IO_exit(inode, offset, count, rw, ret); return ret; } @@ -1804,88 +3274,110 @@ static int ext4_journalled_set_page_dirty(struct page *page) return __set_page_dirty_nobuffers(page); } -static const struct address_space_operations ext4_ordered_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_ordered_writepage, - .sync_page = block_sync_page, - .write_begin = ext4_write_begin, - .write_end = ext4_ordered_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, +static const struct address_space_operations ext4_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writepage, + .writepages = ext4_writepages, + .write_begin = ext4_write_begin, + .write_end = ext4_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; -static const struct address_space_operations ext4_writeback_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_writeback_writepage, - .sync_page = block_sync_page, - .write_begin = ext4_write_begin, - .write_end = ext4_writeback_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, +static const struct address_space_operations ext4_journalled_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writepage, + .writepages = ext4_writepages, + .write_begin = ext4_write_begin, + .write_end = ext4_journalled_write_end, + .set_page_dirty = ext4_journalled_set_page_dirty, + .bmap = ext4_bmap, + .invalidatepage = ext4_journalled_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; -static const struct address_space_operations ext4_journalled_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_journalled_writepage, - .sync_page = block_sync_page, - .write_begin = ext4_write_begin, - .write_end = ext4_journalled_write_end, - .set_page_dirty = ext4_journalled_set_page_dirty, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, +static const struct address_space_operations ext4_da_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writepage, + .writepages = ext4_writepages, + .write_begin = ext4_da_write_begin, + .write_end = ext4_da_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_da_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, }; void ext4_set_aops(struct inode *inode) { - if (ext4_should_order_data(inode)) - inode->i_mapping->a_ops = &ext4_ordered_aops; - 
else if (ext4_should_writeback_data(inode))
-		inode->i_mapping->a_ops = &ext4_writeback_aops;
-	else
+	switch (ext4_inode_journal_mode(inode)) {
+	case EXT4_INODE_ORDERED_DATA_MODE:
+		ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE);
+		break;
+	case EXT4_INODE_WRITEBACK_DATA_MODE:
+		ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE);
+		break;
+	case EXT4_INODE_JOURNAL_DATA_MODE:
 		inode->i_mapping->a_ops = &ext4_journalled_aops;
+		return;
+	default:
+		BUG();
+	}
+	if (test_opt(inode->i_sb, DELALLOC))
+		inode->i_mapping->a_ops = &ext4_da_aops;
+	else
+		inode->i_mapping->a_ops = &ext4_aops;
 }
 
 /*
- * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
- * up to the end of the block which corresponds to `from'.
- * This required during truncate. We need to physically zero the tail end
- * of that block so it doesn't yield old data if the file is later grown.
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'.  The range to be zeroed must be
+ * contained within one block.  If the specified range exceeds the end
+ * of the block, it will be shortened to the end of the block that
+ * corresponds to 'from'.
 */
-int ext4_block_truncate_page(handle_t *handle, struct page *page,
-		struct address_space *mapping, loff_t from)
+static int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length)
 {
 	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
-	unsigned blocksize, length, pos;
+	unsigned blocksize, max, pos;
 	ext4_lblk_t iblock;
 	struct inode *inode = mapping->host;
 	struct buffer_head *bh;
+	struct page *page;
 	int err = 0;
 
+	page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+				   mapping_gfp_mask(mapping) & ~__GFP_FS);
+	if (!page)
+		return -ENOMEM;
+
 	blocksize = inode->i_sb->s_blocksize;
-	length = blocksize - (offset & (blocksize - 1));
-	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+	max = blocksize - (offset & (blocksize - 1));
 
 	/*
-	 * For "nobh" option, we can only work if we don't need to
-	 * read-in the page - otherwise we create buffers to do the IO.
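[Editor's aside — worked example, not part of this patch. The clamp
computed above ('max') keeps the zeroed range inside one block; the
numbers below assume 4096-byte blocks and pages, purely for illustration.]

	loff_t from = 5000;
	unsigned offset = from & (4096 - 1);		/* 904: offset in the block */
	unsigned max = 4096 - (offset & (4096 - 1));	/* 3192 bytes remain */
	/* any requested length > 3192 (or < 0) is shortened to 3192, so
	 * the zeroing never crosses the block boundary */

[End of aside.]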
+ * correct length if it does not fall between + * 'from' and the end of the block */ - if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && - ext4_should_writeback_data(inode) && PageUptodate(page)) { - zero_user(page, offset, length); - set_page_dirty(page); - goto unlock; - } + if (length > max || length < 0) + length = max; + + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); @@ -1898,13 +3390,10 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, iblock++; pos += blocksize; } - - err = 0; if (buffer_freed(bh)) { BUFFER_TRACE(bh, "freed: skip"); goto unlock; } - if (!buffer_mapped(bh)) { BUFFER_TRACE(bh, "unmapped"); ext4_get_block(inode, iblock, bh, 0); @@ -1927,25 +3416,22 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, if (!buffer_uptodate(bh)) goto unlock; } - if (ext4_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, bh); if (err) goto unlock; } - zero_user(page, offset, length); - BUFFER_TRACE(bh, "zeroed end of block"); - err = 0; if (ext4_should_journal_data(inode)) { - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); } else { - if (ext4_should_order_data(inode)) - err = ext4_journal_dirty_data(handle, bh); + err = 0; mark_buffer_dirty(bh); + if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) + err = ext4_jbd2_file_inode(handle, inode); } unlock: @@ -1955,342 +3441,235 @@ unlock: } /* - * Probably it should be a library function... search for first non-zero word - * or memcmp with zero_page, whatever is better for particular architecture. - * Linus? + * ext4_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. */ -static inline int all_zeroes(__le32 *p, __le32 *q) +static int ext4_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from) { - while (p < q) - if (*p++) - return 0; - return 1; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned length; + unsigned blocksize; + struct inode *inode = mapping->host; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + + return ext4_block_zero_page_range(handle, mapping, from, length); } -/** - * ext4_find_shared - find the indirect blocks for partial truncation. - * @inode: inode in question - * @depth: depth of the affected branch - * @offsets: offsets of pointers in that branch (see ext4_block_to_path) - * @chain: place to store the pointers to partial indirect blocks - * @top: place to the (detached) top of branch - * - * This is a helper function used by ext4_truncate(). - * - * When we do truncate() we may have to clean the ends of several - * indirect blocks but leave the blocks themselves alive. Block is - * partially truncated if some data below the new i_size is refered - * from it (and it is on the path to the first completely truncated - * data block, indeed). We have to free the top of that path along - * with everything to the right of the path. 
Since no allocation - * past the truncation point is possible until ext4_truncate() - * finishes, we may safely do the latter, but top of branch may - * require special attention - pageout below the truncation point - * might try to populate it. - * - * We atomically detach the top of branch from the tree, store the - * block number of its root in *@top, pointers to buffer_heads of - * partially truncated blocks - in @chain[].bh and pointers to - * their last elements that should not be removed - in - * @chain[].p. Return value is the pointer to last filled element - * of @chain. - * - * The work left to caller to do the actual freeing of subtrees: - * a) free the subtree starting from *@top - * b) free the subtrees whose roots are stored in - * (@chain[i].p+1 .. end of @chain[i].bh->b_data) - * c) free the subtrees growing from the inode past the @chain[0]. - * (no partially truncated stuff there). */ - -static Indirect *ext4_find_shared(struct inode *inode, int depth, - ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top) -{ - Indirect *partial, *p; - int k, err; - - *top = 0; - /* Make k index the deepest non-null offest + 1 */ - for (k = depth; k > 1 && !offsets[k-1]; k--) - ; - partial = ext4_get_branch(inode, k, offsets, chain, &err); - /* Writer: pointers */ - if (!partial) - partial = chain + k-1; - /* - * If the branch acquired continuation since we've looked at it - - * fine, it should all survive and (new) top doesn't belong to us. - */ - if (!partial->key && *partial->p) - /* Writer: end */ - goto no_top; - for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) - ; - /* - * OK, we've found the last block that must survive. The rest of our - * branch should be detached before unlocking. However, if that rest - * of branch is all ours and does not grow immediately from the inode - * it's easier to cheat and just decrement partial->p. - */ - if (p == chain + k - 1 && p > chain) { - p->p--; - } else { - *top = *p->p; - /* Nope, don't do this in ext4. 
Must leave the tree intact */
-#if 0
-		*p->p = 0;
-#endif
-	}
-	/* Writer: end */
+int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+			     loff_t lstart, loff_t length)
+{
+	struct super_block *sb = inode->i_sb;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned partial_start, partial_end;
+	ext4_fsblk_t start, end;
+	loff_t byte_end = (lstart + length - 1);
+	int err = 0;
+
+	partial_start = lstart & (sb->s_blocksize - 1);
+	partial_end = byte_end & (sb->s_blocksize - 1);
+
+	start = lstart >> sb->s_blocksize_bits;
+	end = byte_end >> sb->s_blocksize_bits;
 
-	while(partial > p) {
-		brelse(partial->bh);
-		partial--;
+	/* Handle partial zero within the single block */
+	if (start == end &&
+	    (partial_start || (partial_end != sb->s_blocksize - 1))) {
+		err = ext4_block_zero_page_range(handle, mapping,
+						 lstart, length);
+		return err;
+	}
+	/* Handle partial zero out on the start of the range */
+	if (partial_start) {
+		err = ext4_block_zero_page_range(handle, mapping,
+						 lstart, sb->s_blocksize);
+		if (err)
+			return err;
 	}
-no_top:
-	return partial;
+	/* Handle partial zero out on the end of the range */
+	if (partial_end != sb->s_blocksize - 1)
+		err = ext4_block_zero_page_range(handle, mapping,
+						 byte_end - partial_end,
+						 partial_end + 1);
+	return err;
+}
+
+int ext4_can_truncate(struct inode *inode)
+{
+	if (S_ISREG(inode->i_mode))
+		return 1;
+	if (S_ISDIR(inode->i_mode))
+		return 1;
+	if (S_ISLNK(inode->i_mode))
+		return !ext4_inode_is_fast_symlink(inode);
+	return 0;
 }
 
 /*
- * Zero a number of block pointers in either an inode or an indirect block.
- * If we restart the transaction we must again get write access to the
- * indirect block for further modification.
+ * ext4_punch_hole: punches a hole in a file by releasing the blocks
+ * associated with the given offset and length
+ *
+ * @inode:  File inode
+ * @offset: The offset where the hole will begin
+ * @len:    The length of the hole
 *
- * We release `count' blocks on disk, but (last - first) may be greater
- * than `count' because there can be holes in there.
+ * Returns: 0 on success or negative on failure
 */
-static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
-		struct buffer_head *bh, ext4_fsblk_t block_to_free,
-		unsigned long count, __le32 *first, __le32 *last)
-{
-	__le32 *p;
-	if (try_to_extend_transaction(handle, inode)) {
-		if (bh) {
-			BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
-			ext4_journal_dirty_metadata(handle, bh);
-		}
-		ext4_mark_inode_dirty(handle, inode);
-		ext4_journal_test_restart(handle, inode);
-		if (bh) {
-			BUFFER_TRACE(bh, "retaking write access");
-			ext4_journal_get_write_access(handle, bh);
-		}
-	}
+
+int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
+{
+	struct super_block *sb = inode->i_sb;
+	ext4_lblk_t first_block, stop_block;
+	struct address_space *mapping = inode->i_mapping;
+	loff_t first_block_offset, last_block_offset;
+	handle_t *handle;
+	unsigned int credits;
+	int ret = 0;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	trace_ext4_punch_hole(inode, offset, length, 0);
 
 	/*
-	 * Any buffers which are on the journal will be in memory. We find
-	 * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget()
-	 * on them.  We've already detached each block from the file, so
-	 * bforget() in jbd2_journal_forget() should be safe.
-	 *
-	 * AKPM: turn on bforget in jbd2_journal_forget()!!!
+	 * Write out all dirty pages to avoid race conditions.
+	 * Then release them.
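[Editor's aside — illustrative sketch, not part of this patch.
ext4_punch_hole() above is reached through fallocate(2); a minimal
user-space call, with an arbitrary file name and a 1 MiB hole:]

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <unistd.h>
	#include <linux/falloc.h>	/* FALLOC_FL_PUNCH_HOLE */

	int main(void)
	{
		int fd = open("testfile", O_RDWR);

		if (fd < 0)
			return 1;
		/* free blocks in [1MiB, 2MiB) without changing i_size;
		 * partial blocks at the edges are zeroed, whole blocks
		 * are released back to the filesystem */
		if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			      1024 * 1024, 1024 * 1024) < 0)
			return 1;
		return close(fd);
	}

[End of aside.]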
*/ - for (p = first; p < last; p++) { - u32 nr = le32_to_cpu(*p); - if (nr) { - struct buffer_head *tbh; - - *p = 0; - tbh = sb_find_get_block(inode->i_sb, nr); - ext4_forget(handle, 0, inode, tbh, nr); - } + if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { + ret = filemap_write_and_wait_range(mapping, offset, + offset + length - 1); + if (ret) + return ret; } - ext4_free_blocks(handle, inode, block_to_free, count, 0); -} + mutex_lock(&inode->i_mutex); -/** - * ext4_free_data - free a list of data blocks - * @handle: handle for this transaction - * @inode: inode we are dealing with - * @this_bh: indirect buffer_head which contains *@first and *@last - * @first: array of block numbers - * @last: points immediately past the end of array - * - * We are freeing all blocks refered from that array (numbers are stored as - * little-endian 32-bit) and updating @inode->i_blocks appropriately. - * - * We accumulate contiguous runs of blocks to free. Conveniently, if these - * blocks are contiguous then releasing them at one time will only affect one - * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't - * actually use a lot of journal space. - * - * @this_bh will be %NULL if @first and @last point into the inode's direct - * block pointers. - */ -static void ext4_free_data(handle_t *handle, struct inode *inode, - struct buffer_head *this_bh, - __le32 *first, __le32 *last) -{ - ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ - unsigned long count = 0; /* Number of blocks in the run */ - __le32 *block_to_free_p = NULL; /* Pointer into inode/ind - corresponding to - block_to_free */ - ext4_fsblk_t nr; /* Current block # */ - __le32 *p; /* Pointer into inode/ind - for current block */ - int err; + /* No need to punch hole beyond i_size */ + if (offset >= inode->i_size) + goto out_mutex; - if (this_bh) { /* For indirect block */ - BUFFER_TRACE(this_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, this_bh); - /* Important: if we can't update the indirect pointers - * to the blocks, we can't free them. 
*/ - if (err) - return; + /* + * If the hole extends beyond i_size, set the hole + * to end after the page that contains i_size + */ + if (offset + length > inode->i_size) { + length = inode->i_size + + PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) - + offset; } - for (p = first; p < last; p++) { - nr = le32_to_cpu(*p); - if (nr) { - /* accumulate blocks to free if they're contiguous */ - if (count == 0) { - block_to_free = nr; - block_to_free_p = p; - count = 1; - } else if (nr == block_to_free + count) { - count++; - } else { - ext4_clear_blocks(handle, inode, this_bh, - block_to_free, - count, block_to_free_p, p); - block_to_free = nr; - block_to_free_p = p; - count = 1; - } - } + if (offset & (sb->s_blocksize - 1) || + (offset + length) & (sb->s_blocksize - 1)) { + /* + * Attach jinode to inode for jbd2 if we do any zeroing of + * partial block + */ + ret = ext4_inode_attach_jinode(inode); + if (ret < 0) + goto out_mutex; + } - if (count > 0) - ext4_clear_blocks(handle, inode, this_bh, block_to_free, - count, block_to_free_p, p); + first_block_offset = round_up(offset, sb->s_blocksize); + last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; + + /* Now release the pages and zero block aligned part of pages*/ + if (last_block_offset > first_block_offset) + truncate_pagecache_range(inode, first_block_offset, + last_block_offset); - if (this_bh) { - BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, this_bh); + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + credits = ext4_writepage_trans_blocks(inode); + else + credits = ext4_blocks_for_truncate(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_std_error(sb, ret); + goto out_dio; } -} -/** - * ext4_free_branches - free an array of branches - * @handle: JBD handle for this transaction - * @inode: inode we are dealing with - * @parent_bh: the buffer_head which contains *@first and *@last - * @first: array of block numbers - * @last: pointer immediately past the end of array - * @depth: depth of the branches to free - * - * We are freeing all blocks refered from these branches (numbers are - * stored as little-endian 32-bit) and updating @inode->i_blocks - * appropriately. - */ -static void ext4_free_branches(handle_t *handle, struct inode *inode, - struct buffer_head *parent_bh, - __le32 *first, __le32 *last, int depth) -{ - ext4_fsblk_t nr; - __le32 *p; + ret = ext4_zero_partial_blocks(handle, inode, offset, + length); + if (ret) + goto out_stop; - if (is_handle_aborted(handle)) - return; + first_block = (offset + sb->s_blocksize - 1) >> + EXT4_BLOCK_SIZE_BITS(sb); + stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - if (depth--) { - struct buffer_head *bh; - int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); - p = last; - while (--p >= first) { - nr = le32_to_cpu(*p); - if (!nr) - continue; /* A hole */ + /* If there are no blocks to remove, return now */ + if (first_block >= stop_block) + goto out_stop; - /* Go read the buffer for the next level down */ - bh = sb_bread(inode->i_sb, nr); + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); - /* - * A read failure? Report error and clear slot - * (should be rare). 
- */ - if (!bh) { - ext4_error(inode->i_sb, "ext4_free_branches", - "Read failure, inode=%lu, block=%llu", - inode->i_ino, nr); - continue; - } + ret = ext4_es_remove_extent(inode, first_block, + stop_block - first_block); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } - /* This zaps the entire block. Bottom up. */ - BUFFER_TRACE(bh, "free child branches"); - ext4_free_branches(handle, inode, bh, - (__le32*)bh->b_data, - (__le32*)bh->b_data + addr_per_block, - depth); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ret = ext4_ext_remove_space(inode, first_block, + stop_block - 1); + else + ret = ext4_free_hole_blocks(handle, inode, first_block, + stop_block); - /* - * We've probably journalled the indirect block several - * times during the truncate. But it's no longer - * needed and we now drop it from the transaction via - * jbd2_journal_revoke(). - * - * That's easy if it's exclusively part of this - * transaction. But if it's part of the committing - * transaction then jbd2_journal_forget() will simply - * brelse() it. That means that if the underlying - * block is reallocated in ext4_get_block(), - * unmap_underlying_metadata() will find this block - * and will try to get rid of it. damn, damn. - * - * If this block has already been committed to the - * journal, a revoke record will be written. And - * revoke records must be emitted *before* clearing - * this block's bit in the bitmaps. - */ - ext4_forget(handle, 1, inode, bh, bh->b_blocknr); + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); - /* - * Everything below this this pointer has been - * released. Now let this top-of-subtree go. - * - * We want the freeing of this indirect block to be - * atomic in the journal with the updating of the - * bitmap block which owns it. So make some room in - * the journal. - * - * We zero the parent pointer *after* freeing its - * pointee in the bitmaps, so if extend_transaction() - * for some reason fails to put the bitmap changes and - * the release into the same transaction, recovery - * will merely complain about releasing a free block, - * rather than leaking blocks. 
- */ - if (is_handle_aborted(handle)) - return; - if (try_to_extend_transaction(handle, inode)) { - ext4_mark_inode_dirty(handle, inode); - ext4_journal_test_restart(handle, inode); - } + /* Now release the pages again to reduce race window */ + if (last_block_offset > first_block_offset) + truncate_pagecache_range(inode, first_block_offset, + last_block_offset); - ext4_free_blocks(handle, inode, nr, 1, 1); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); +out_stop: + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; +} - if (parent_bh) { - /* - * The block which we have just freed is - * pointed to by an indirect block: journal it - */ - BUFFER_TRACE(parent_bh, "get_write_access"); - if (!ext4_journal_get_write_access(handle, - parent_bh)){ - *p = 0; - BUFFER_TRACE(parent_bh, - "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, - parent_bh); - } - } +int ext4_inode_attach_jinode(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct jbd2_inode *jinode; + + if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) + return 0; + + jinode = jbd2_alloc_inode(GFP_KERNEL); + spin_lock(&inode->i_lock); + if (!ei->jinode) { + if (!jinode) { + spin_unlock(&inode->i_lock); + return -ENOMEM; } - } else { - /* We have reached the bottom of the tree. */ - BUFFER_TRACE(parent_bh, "free data blocks"); - ext4_free_data(handle, inode, parent_bh, first, last); + ei->jinode = jinode; + jbd2_journal_init_jbd_inode(ei->jinode, inode); + jinode = NULL; } + spin_unlock(&inode->i_lock); + if (unlikely(jinode != NULL)) + jbd2_free_inode(jinode); + return 0; } /* @@ -2300,7 +3679,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, * transaction, and VFS/VM ensures that ext4_truncate() cannot run * simultaneously on behalf of the same inode. * - * As we work through the truncate and commmit bits of it to the journal there + * As we work through the truncate and commit bits of it to the journal there * is one core, guiding principle: the file's tree must always be consistent on * disk. We must be able to restart the truncate after a crash. * @@ -2323,73 +3702,61 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, */ void ext4_truncate(struct inode *inode) { - handle_t *handle; struct ext4_inode_info *ei = EXT4_I(inode); - __le32 *i_data = ei->i_data; - int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); + unsigned int credits; + handle_t *handle; struct address_space *mapping = inode->i_mapping; - ext4_lblk_t offsets[4]; - Indirect chain[4]; - Indirect *partial; - __le32 nr = 0; - int n; - ext4_lblk_t last_block; - unsigned blocksize = inode->i_sb->s_blocksize; - struct page *page; - - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; - if (ext4_inode_is_fast_symlink(inode)) - return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; /* - * We have to lock the EOF page here, because lock_page() nests - * outside jbd2_journal_start(). + * There is a possibility that we're either freeing the inode + * or it's a completely new inode. In those cases we might not + * have i_mutex locked because it's not necessary. */ - if ((inode->i_size & (blocksize - 1)) == 0) { - /* Block boundary? 
Nothing to do */ - page = NULL; - } else { - page = grab_cache_page(mapping, - inode->i_size >> PAGE_CACHE_SHIFT); - if (!page) - return; - } + if (!(inode->i_state & (I_NEW|I_FREEING))) + WARN_ON(!mutex_is_locked(&inode->i_mutex)); + trace_ext4_truncate_enter(inode); - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { - ext4_ext_truncate(inode, page); + if (!ext4_can_truncate(inode)) return; + + ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) + ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + + if (ext4_has_inline_data(inode)) { + int has_inline = 1; + + ext4_inline_data_truncate(inode, &has_inline); + if (has_inline) + return; } - handle = start_transaction(inode); - if (IS_ERR(handle)) { - if (page) { - clear_highpage(page); - flush_dcache_page(page); - unlock_page(page); - page_cache_release(page); - } - return; /* AKPM: return what? */ + /* If we zero-out tail of the page, we have to create jinode for jbd2 */ + if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { + if (ext4_inode_attach_jinode(inode) < 0) + return; } - last_block = (inode->i_size + blocksize-1) - >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + credits = ext4_writepage_trans_blocks(inode); + else + credits = ext4_blocks_for_truncate(inode); - if (page) - ext4_block_truncate_page(handle, page, mapping, inode->i_size); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ext4_std_error(inode->i_sb, PTR_ERR(handle)); + return; + } - n = ext4_block_to_path(inode, last_block, offsets, NULL); - if (n == 0) - goto out_stop; /* error */ + if (inode->i_size & (inode->i_sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, mapping, inode->i_size); /* - * OK. This truncate is going to happen. We add the inode to the - * orphan list, so that if this truncate spans multiple transactions, - * and we crash, we will resume the truncate when the filesystem - * recovers. It also marks the inode dirty, to catch the new size. + * We add the inode to the orphan list, so that if this + * truncate spans multiple transactions, and we crash, we will + * resume the truncate when the filesystem recovers. It also + * marks the inode dirty, to catch the new size. * * Implication: the file must always be in a sane, consistent * truncatable state while each transaction commits. @@ -2397,96 +3764,23 @@ void ext4_truncate(struct inode *inode) if (ext4_orphan_add(handle, inode)) goto out_stop; - /* - * The orphan list entry will now protect us from any crash which - * occurs before the truncate completes, so it is now safe to propagate - * the new, shorter inode size (held for now in i_size) into the - * on-disk inode. We do this via i_disksize, which is the value which - * ext4 *really* writes onto the disk inode. - */ - ei->i_disksize = inode->i_size; - - /* - * From here we block out all ext4_get_block() callers who want to - * modify the block allocation tree. 
- */ - down_write(&ei->i_data_sem); + down_write(&EXT4_I(inode)->i_data_sem); - if (n == 1) { /* direct blocks */ - ext4_free_data(handle, inode, NULL, i_data+offsets[0], - i_data + EXT4_NDIR_BLOCKS); - goto do_indirects; - } + ext4_discard_preallocations(inode); - partial = ext4_find_shared(inode, n, offsets, chain, &nr); - /* Kill the top of shared branch (not detached) */ - if (nr) { - if (partial == chain) { - /* Shared branch grows from the inode */ - ext4_free_branches(handle, inode, NULL, - &nr, &nr+1, (chain+n-1) - partial); - *partial->p = 0; - /* - * We mark the inode dirty prior to restart, - * and prior to stop. No need for it here. - */ - } else { - /* Shared branch grows from an indirect block */ - BUFFER_TRACE(partial->bh, "get_write_access"); - ext4_free_branches(handle, inode, partial->bh, - partial->p, - partial->p+1, (chain+n-1) - partial); - } - } - /* Clear the ends of indirect blocks on the shared branch */ - while (partial > chain) { - ext4_free_branches(handle, inode, partial->bh, partial->p + 1, - (__le32*)partial->bh->b_data+addr_per_block, - (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse (partial->bh); - partial--; - } -do_indirects: - /* Kill the remaining (whole) subtrees */ - switch (offsets[0]) { - default: - nr = i_data[EXT4_IND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); - i_data[EXT4_IND_BLOCK] = 0; - } - case EXT4_IND_BLOCK: - nr = i_data[EXT4_DIND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); - i_data[EXT4_DIND_BLOCK] = 0; - } - case EXT4_DIND_BLOCK: - nr = i_data[EXT4_TIND_BLOCK]; - if (nr) { - ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); - i_data[EXT4_TIND_BLOCK] = 0; - } - case EXT4_TIND_BLOCK: - ; - } - - ext4_discard_reservation(inode); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ext4_ext_truncate(handle, inode); + else + ext4_ind_truncate(handle, inode); up_write(&ei->i_data_sem); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - /* - * In a multi-transaction truncate, we only make the final transaction - * synchronous - */ if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); + out_stop: /* - * If this was a simple ftruncate(), and the file will remain alive + * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. 
* However, if this was a real unlink then we were called by * ext4_delete_inode(), and we allow that function to clean up the @@ -2495,56 +3789,11 @@ out_stop: if (inode->i_nlink) ext4_orphan_del(handle, inode); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); -} -static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb, - unsigned long ino, struct ext4_iloc *iloc) -{ - unsigned long desc, group_desc; - ext4_group_t block_group; - unsigned long offset; - ext4_fsblk_t block; - struct buffer_head *bh; - struct ext4_group_desc * gdp; - - if (!ext4_valid_inum(sb, ino)) { - /* - * This error is already checked for in namei.c unless we are - * looking at an NFS filehandle, in which case no error - * report is needed - */ - return 0; - } - - block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); - if (block_group >= EXT4_SB(sb)->s_groups_count) { - ext4_error(sb,"ext4_get_inode_block","group >= groups count"); - return 0; - } - smp_rmb(); - group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); - desc = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); - bh = EXT4_SB(sb)->s_group_desc[group_desc]; - if (!bh) { - ext4_error (sb, "ext4_get_inode_block", - "Descriptor not loaded"); - return 0; - } - - gdp = (struct ext4_group_desc *)((__u8 *)bh->b_data + - desc * EXT4_DESC_SIZE(sb)); - /* - * Figure out the offset within the block group inode table - */ - offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) * - EXT4_INODE_SIZE(sb); - block = ext4_inode_table(sb, gdp) + - (offset >> EXT4_BLOCK_SIZE_BITS(sb)); - - iloc->block_group = block_group; - iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1); - return block; + trace_ext4_truncate_exit(inode); } /* @@ -2556,23 +3805,45 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb, static int __ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc, int in_mem) { - ext4_fsblk_t block; - struct buffer_head *bh; - - block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc); - if (!block) + struct ext4_group_desc *gdp; + struct buffer_head *bh; + struct super_block *sb = inode->i_sb; + ext4_fsblk_t block; + int inodes_per_block, inode_offset; + + iloc->bh = NULL; + if (!ext4_valid_inum(sb, inode->i_ino)) return -EIO; - bh = sb_getblk(inode->i_sb, block); - if (!bh) { - ext4_error (inode->i_sb, "ext4_get_inode_loc", - "unable to read inode block - " - "inode=%lu, block=%llu", - inode->i_ino, block); + iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); + gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); + if (!gdp) return -EIO; - } + + /* + * Figure out the offset within the block group inode table + */ + inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + inode_offset = ((inode->i_ino - 1) % + EXT4_INODES_PER_GROUP(sb)); + block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); + iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); + + bh = sb_getblk(sb, block); + if (unlikely(!bh)) + return -ENOMEM; if (!buffer_uptodate(bh)) { lock_buffer(bh); + + /* + * If the buffer has the write error flag, we have failed + * to write out another inode in the same block. In this + * case, we don't have to read the block because we may + * read the old inode data successfully. 
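[Editor's aside — worked example, not part of this patch. The iloc
computation above, with hypothetical geometry: 8192 inodes per group and
16 inodes per 4096-byte block (256-byte on-disk inodes).]

	/* ino = 8200: (8200 - 1) / 8192 = group 1
	 *             (8200 - 1) % 8192 = inode_offset 7
	 * block        = inode table of group 1 + 7 / 16 = table + 0
	 * iloc->offset = (7 % 16) * 256 = 1792 bytes into that block */

[End of aside.]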
+ */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) + set_buffer_uptodate(bh); + if (buffer_uptodate(bh)) { /* someone brought it uptodate while we waited */ unlock_buffer(bh); @@ -2586,29 +3857,13 @@ static int __ext4_get_inode_loc(struct inode *inode, */ if (in_mem) { struct buffer_head *bitmap_bh; - struct ext4_group_desc *desc; - int inodes_per_buffer; - int inode_offset, i; - ext4_group_t block_group; - int start; - - block_group = (inode->i_ino - 1) / - EXT4_INODES_PER_GROUP(inode->i_sb); - inodes_per_buffer = bh->b_size / - EXT4_INODE_SIZE(inode->i_sb); - inode_offset = ((inode->i_ino - 1) % - EXT4_INODES_PER_GROUP(inode->i_sb)); - start = inode_offset & ~(inodes_per_buffer - 1); + int i, start; - /* Is the inode bitmap in cache? */ - desc = ext4_get_group_desc(inode->i_sb, - block_group, NULL); - if (!desc) - goto make_io; + start = inode_offset & ~(inodes_per_block - 1); - bitmap_bh = sb_getblk(inode->i_sb, - ext4_inode_bitmap(inode->i_sb, desc)); - if (!bitmap_bh) + /* Is the inode bitmap in cache? */ + bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); + if (unlikely(!bitmap_bh)) goto make_io; /* @@ -2620,14 +3875,14 @@ static int __ext4_get_inode_loc(struct inode *inode, brelse(bitmap_bh); goto make_io; } - for (i = start; i < start + inodes_per_buffer; i++) { + for (i = start; i < start + inodes_per_block; i++) { if (i == inode_offset) continue; if (ext4_test_bit(i, bitmap_bh->b_data)) break; } brelse(bitmap_bh); - if (i == start + inodes_per_buffer) { + if (i == start + inodes_per_block) { /* all other inodes are free, so skip I/O */ memset(bh->b_data, 0, bh->b_size); set_buffer_uptodate(bh); @@ -2638,19 +3893,43 @@ static int __ext4_get_inode_loc(struct inode *inode, make_io: /* + * If we need to do any I/O, try to pre-readahead extra + * blocks from the inode table. + */ + if (EXT4_SB(sb)->s_inode_readahead_blks) { + ext4_fsblk_t b, end, table; + unsigned num; + __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; + + table = ext4_inode_table(sb, gdp); + /* s_inode_readahead_blks is always a power of 2 */ + b = block & ~((ext4_fsblk_t) ra_blks - 1); + if (table > b) + b = table; + end = b + ra_blks; + num = EXT4_INODES_PER_GROUP(sb); + if (ext4_has_group_desc_csum(sb)) + num -= ext4_itable_unused_count(sb, gdp); + table += num / inodes_per_block; + if (end > table) + end = table; + while (b <= end) + sb_breadahead(sb, b++); + } + + /* * There are other valid inodes in the buffer, this inode * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. */ + trace_ext4_load_inode(inode); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(READ_META, bh); + submit_bh(READ | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - ext4_error(inode->i_sb, "ext4_get_inode_loc", - "unable to read inode block - " - "inode=%lu, block=%llu", - inode->i_ino, block); + EXT4_ERROR_INODE_BLOCK(inode, block, + "unable to read itable block"); brelse(bh); return -EIO; } @@ -2664,46 +3943,55 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) { /* We have all inode data except xattrs in memory here. 
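[Editor's aside — worked example, not part of this patch. The readahead
window above rounds the wanted block down to a power-of-two boundary;
assuming s_inode_readahead_blks = 32 and an itable block 1000:]

	/* b   = 1000 & ~(32 - 1) = 992	(aligned window start)
	 * end = 992 + 32         = 1024
	 * sb_breadahead() is then issued for blocks 992 through 'end',
	 * after the window is clamped so it neither starts before the
	 * inode table nor runs past the table blocks still in use. */

[End of aside.]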
*/ return __ext4_get_inode_loc(inode, iloc, - !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); + !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); } void ext4_set_inode_flags(struct inode *inode) { unsigned int flags = EXT4_I(inode)->i_flags; + unsigned int new_fl = 0; - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); if (flags & EXT4_SYNC_FL) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; if (flags & EXT4_APPEND_FL) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (flags & EXT4_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (flags & EXT4_NOATIME_FL) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) - inode->i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; + inode_set_flags(inode, new_fl, + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); } /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ void ext4_get_inode_flags(struct ext4_inode_info *ei) { - unsigned int flags = ei->vfs_inode.i_flags; - - ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL| - EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); - if (flags & S_SYNC) - ei->i_flags |= EXT4_SYNC_FL; - if (flags & S_APPEND) - ei->i_flags |= EXT4_APPEND_FL; - if (flags & S_IMMUTABLE) - ei->i_flags |= EXT4_IMMUTABLE_FL; - if (flags & S_NOATIME) - ei->i_flags |= EXT4_NOATIME_FL; - if (flags & S_DIRSYNC) - ei->i_flags |= EXT4_DIRSYNC_FL; + unsigned int vfs_fl; + unsigned long old_fl, new_fl; + + do { + vfs_fl = ei->vfs_inode.i_flags; + old_fl = ei->i_flags; + new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| + EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| + EXT4_DIRSYNC_FL); + if (vfs_fl & S_SYNC) + new_fl |= EXT4_SYNC_FL; + if (vfs_fl & S_APPEND) + new_fl |= EXT4_APPEND_FL; + if (vfs_fl & S_IMMUTABLE) + new_fl |= EXT4_IMMUTABLE_FL; + if (vfs_fl & S_NOATIME) + new_fl |= EXT4_NOATIME_FL; + if (vfs_fl & S_DIRSYNC) + new_fl |= EXT4_DIRSYNC_FL; + } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); } + static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, - struct ext4_inode_info *ei) + struct ext4_inode_info *ei) { blkcnt_t i_blocks ; struct inode *inode = &(ei->vfs_inode); @@ -2714,7 +4002,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, /* we are using combined 48 bit field */ i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | le32_to_cpu(raw_inode->i_blocks_lo); - if (ei->i_flags & EXT4_HUGE_FILE_FL) { + if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { /* i_blocks represent file system block size */ return i_blocks << (inode->i_blkbits - 9); } else { @@ -2725,15 +4013,30 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, } } +static inline void ext4_iget_extra_inode(struct inode *inode, + struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +{ + __le32 *magic = (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; + if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { + ext4_set_inode_state(inode, EXT4_STATE_XATTR); + ext4_find_inline_data_nolock(inode); + } else + EXT4_I(inode)->i_inline_off = 0; +} + struct inode *ext4_iget(struct super_block *sb, unsigned long ino) { struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct ext4_inode_info *ei; - struct buffer_head *bh; struct inode *inode; + journal_t *journal = EXT4_SB(sb)->s_journal; long ret; int block; + uid_t i_uid; + gid_t i_gid; inode = iget_locked(sb, ino); if (!inode) @@ -2742,27 +4045,58 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) return inode; ei = EXT4_I(inode); -#ifdef 
CONFIG_EXT4DEV_FS_POSIX_ACL - ei->i_acl = EXT4_ACL_NOT_CACHED; - ei->i_default_acl = EXT4_ACL_NOT_CACHED; -#endif - ei->i_block_alloc_info = NULL; + iloc.bh = NULL; ret = __ext4_get_inode_loc(inode, &iloc, 0); if (ret < 0) goto bad_inode; - bh = iloc.bh; raw_inode = ext4_raw_inode(&iloc); + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > + EXT4_INODE_SIZE(inode->i_sb)) { + EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)", + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize, + EXT4_INODE_SIZE(inode->i_sb)); + ret = -EIO; + goto bad_inode; + } + } else + ei->i_extra_isize = 0; + + /* Precompute checksum seed for inode metadata */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 csum; + __le32 inum = cpu_to_le32(inode->i_ino); + __le32 gen = raw_inode->i_generation; + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + sizeof(inum)); + ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, + sizeof(gen)); + } + + if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { + EXT4_ERROR_INODE(inode, "checksum invalid"); + ret = -EIO; + goto bad_inode; + } + inode->i_mode = le16_to_cpu(raw_inode->i_mode); - inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); - inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); - if(!(test_opt (inode->i_sb, NO_UID32))) { - inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; - inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if (!(test_opt(inode->i_sb, NO_UID32))) { + i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } - inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + i_uid_write(inode, i_uid); + i_gid_write(inode, i_gid); + set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); - ei->i_state = 0; + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ + ei->i_inline_off = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. @@ -2771,30 +4105,34 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) * NeilBrown 1999oct15 */ if (inode->i_nlink == 0) { - if (inode->i_mode == 0 || - !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { + if ((inode->i_mode == 0 || + !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && + ino != EXT4_BOOT_LOADER_INO) { /* this inode is deleted */ - brelse (bh); ret = -ESTALE; goto bad_inode; } /* The only unlinked inodes we let through here have * valid i_mode and are being read by the orphan * recovery code: that's fine, we're about to complete - * the process of deleting those. */ + * the process of deleting those. + * OR it is the EXT4_BOOT_LOADER_INO which is + * not initialized on a new filesystem. 
*/ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); - if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != - cpu_to_le32(EXT4_OS_HURD)) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; - } inode->i_size = ext4_isize(raw_inode); ei->i_disksize = inode->i_size; +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +#endif inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; + ei->i_last_alloc_group = ~0; /* * NOTE! The in-memory inode i_data array is in little-endian order * even on big-endian machines: we do NOT byteswap the block numbers! @@ -2803,40 +4141,79 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); + /* + * Set transaction id's of transactions that have to be committed + * to finish f[data]sync. We set them to currently running transaction + * as we cannot be sure that the inode or some of its metadata isn't + * part of the transaction - the inode could have been reclaimed and + * now it is reread from disk. + */ + if (journal) { + transaction_t *transaction; + tid_t tid; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction) + transaction = journal->j_running_transaction; + else + transaction = journal->j_committing_transaction; + if (transaction) + tid = transaction->t_tid; + else + tid = journal->j_commit_sequence; + read_unlock(&journal->j_state_lock); + ei->i_sync_tid = tid; + ei->i_datasync_tid = tid; + } + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { - ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); - if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > - EXT4_INODE_SIZE(inode->i_sb)) { - brelse (bh); - ret = -EIO; - goto bad_inode; - } if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. 
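 * (EXT4_GOOD_OLD_INODE_SIZE is 128, so this claims exactly the extended * fields this kernel knows about past the original 128-byte inode; * in-inode xattrs then start after i_extra_isize.)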
*/ ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { - __le32 *magic = (void *)raw_inode + - EXT4_GOOD_OLD_INODE_SIZE + - ei->i_extra_isize; - if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) - ei->i_state |= EXT4_STATE_XATTR; + ext4_iget_extra_inode(inode, raw_inode, ei); } - } else - ei->i_extra_isize = 0; + } EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); - inode->i_version = le32_to_cpu(raw_inode->i_disk_version); - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - inode->i_version |= - (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + inode->i_version = le32_to_cpu(raw_inode->i_disk_version); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + inode->i_version |= + (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; + } } + ret = 0; + if (ei->i_file_acl && + !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { + EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", + ei->i_file_acl); + ret = -EIO; + goto bad_inode; + } else if (!ext4_has_inline_data(inode)) { + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode)))) + /* Validate extent which is part of inode */ + ret = ext4_ext_check_inode(inode); + } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) { + /* Validate block references which are part of inode */ + ret = ext4_ind_check_inode(inode); + } + } + if (ret) + goto bad_inode; + if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; @@ -2845,13 +4222,16 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { - if (ext4_inode_is_fast_symlink(inode)) + if (ext4_inode_is_fast_symlink(inode)) { inode->i_op = &ext4_fast_symlink_inode_operations; - else { + nd_terminate_link(ei->i_data, inode->i_size, + sizeof(ei->i_data) - 1); + } else { inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); } - } else { + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &ext4_special_inode_operations; if (raw_inode->i_block[0]) init_special_inode(inode, inode->i_mode, @@ -2859,13 +4239,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } else if (ino == EXT4_BOOT_LOADER_INO) { + make_bad_inode(inode); + } else { + ret = -EIO; + EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); + goto bad_inode; } - brelse (iloc.bh); + brelse(iloc.bh); ext4_set_inode_flags(inode); unlock_new_inode(inode); return inode; bad_inode: + brelse(iloc.bh); iget_failed(inode); return ERR_PTR(ret); } @@ -2877,46 +4264,36 @@ static int ext4_inode_blocks_set(handle_t *handle, struct inode *inode = &(ei->vfs_inode); u64 i_blocks = inode->i_blocks; struct super_block *sb = inode->i_sb; - int err = 0; if (i_blocks <= ~0U) { /* - * i_blocks 
can be represnted in a 32 bit variable + * i_blocks can be represented in a 32 bit variable * as multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = 0; - ei->i_flags &= ~EXT4_HUGE_FILE_FL; - } else if (i_blocks <= 0xffffffffffffULL) { + ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); + return 0; + } + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) + return -EFBIG; + + if (i_blocks <= 0xffffffffffffULL) { /* * i_blocks can be represented in a 48 bit variable * as multiple of 512 bytes */ - err = ext4_update_rocompat_feature(handle, sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); - if (err) - goto err_out; - /* i_block is stored in the split 48 bit fields */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); - ei->i_flags &= ~EXT4_HUGE_FILE_FL; + ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); } else { - /* - * i_blocks should be represented in a 48 bit variable - * as multiple of file system block size - */ - err = ext4_update_rocompat_feature(handle, sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); - if (err) - goto err_out; - ei->i_flags |= EXT4_HUGE_FILE_FL; + ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); /* i_block is stored in file system block size */ i_blocks = i_blocks >> (inode->i_blkbits - 9); raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); } -err_out: - return err; + return 0; } /* @@ -2933,36 +4310,42 @@ static int ext4_do_update_inode(handle_t *handle, struct ext4_inode *raw_inode = ext4_raw_inode(iloc); struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; + struct super_block *sb = inode->i_sb; int err = 0, rc, block; + int need_datasync = 0, set_large_file = 0; + uid_t i_uid; + gid_t i_gid; + + spin_lock(&ei->i_raw_lock); - /* For fields not not tracking in the in-memory inode, + /* For fields not tracked in the in-memory inode, * initialise them to zero for new inodes. */ - if (ei->i_state & EXT4_STATE_NEW) + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); ext4_get_inode_flags(ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); - if(!(test_opt(inode->i_sb, NO_UID32))) { - raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); - raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); + i_uid = i_uid_read(inode); + i_gid = i_gid_read(inode); + if (!(test_opt(inode->i_sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); /* * Fix up interoperability with old kernels. 
Otherwise, old inodes get * re-used with the upper 16 bits of the uid/gid intact */ - if(!ei->i_dtime) { + if (!ei->i_dtime) { raw_inode->i_uid_high = - cpu_to_le16(high_16_bits(inode->i_uid)); + cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = - cpu_to_le16(high_16_bits(inode->i_gid)); + cpu_to_le16(high_16_bits(i_gid)); } else { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } } else { - raw_inode->i_uid_low = - cpu_to_le16(fs_high2lowuid(inode->i_uid)); - raw_inode->i_gid_low = - cpu_to_le16(fs_high2lowgid(inode->i_gid)); + raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); + raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } @@ -2973,37 +4356,26 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); - if (ext4_inode_blocks_set(handle, raw_inode, ei)) + if (ext4_inode_blocks_set(handle, raw_inode, ei)) { + spin_unlock(&ei->i_raw_lock); goto out_brelse; + } raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); - raw_inode->i_flags = cpu_to_le32(ei->i_flags); - if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != - cpu_to_le32(EXT4_OS_HURD)) + raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); - ext4_isize_set(raw_inode, ei->i_disksize); + if (ei->i_disksize != ext4_isize(raw_inode)) { + ext4_isize_set(raw_inode, ei->i_disksize); + need_datasync = 1; + } if (ei->i_disksize > 0x7fffffffULL) { - struct super_block *sb = inode->i_sb; if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || EXT4_SB(sb)->s_es->s_rev_level == - cpu_to_le32(EXT4_GOOD_OLD_REV)) { - /* If this is the first large file - * created, add a flag to the superblock. 
- */ - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - goto out_brelse; - ext4_update_dynamic_rev(sb); - EXT4_SET_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_LARGE_FILE); - sb->s_dirt = 1; - handle->h_sync = 1; - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } + cpu_to_le32(EXT4_GOOD_OLD_REV)) + set_large_file = 1; } raw_inode->i_generation = cpu_to_le32(inode->i_generation); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { @@ -3017,26 +4389,45 @@ static int ext4_do_update_inode(handle_t *handle, cpu_to_le32(new_encode_dev(inode->i_rdev)); raw_inode->i_block[2] = 0; } - } else for (block = 0; block < EXT4_N_BLOCKS; block++) - raw_inode->i_block[block] = ei->i_data[block]; + } else if (!ext4_has_inline_data(inode)) { + for (block = 0; block < EXT4_N_BLOCKS; block++) + raw_inode->i_block[block] = ei->i_data[block]; + } - raw_inode->i_disk_version = cpu_to_le32(inode->i_version); - if (ei->i_extra_isize) { - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) - raw_inode->i_version_hi = - cpu_to_le32(inode->i_version >> 32); - raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { + raw_inode->i_disk_version = cpu_to_le32(inode->i_version); + if (ei->i_extra_isize) { + if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) + raw_inode->i_version_hi = + cpu_to_le32(inode->i_version >> 32); + raw_inode->i_extra_isize = + cpu_to_le16(ei->i_extra_isize); + } } + ext4_inode_csum_set(inode, raw_inode, ei); + + spin_unlock(&ei->i_raw_lock); - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - rc = ext4_journal_dirty_metadata(handle, bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + rc = ext4_handle_dirty_metadata(handle, NULL, bh); if (!err) err = rc; - ei->i_state &= ~EXT4_STATE_NEW; - + ext4_clear_inode_state(inode, EXT4_STATE_NEW); + if (set_large_file) { + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (err) + goto out_brelse; + ext4_update_dynamic_rev(sb); + EXT4_SET_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_LARGE_FILE); + ext4_handle_sync(handle); + err = ext4_handle_dirty_super(handle, sb); + } + ext4_update_inode_fsync_trans(handle, inode, need_datasync); out_brelse: - brelse (bh); + brelse(bh); ext4_std_error(inode->i_sb, err); return err; } @@ -3046,21 +4437,20 @@ out_brelse: * * We are called from a few places: * - * - Within generic_file_write() for O_SYNC files. + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running - * trasnaction to commit. + * transaction to commit. * - * - Within sys_sync(), kupdate and such. - * We wait on commit, if tol to. + * - Within flush work (sys_sync(), kupdate and such). + * We wait on commit, if told to. * - * - Within prune_icache() (PF_MEMALLOC == true) - * Here we simply return. We can't afford to block kswapd on the - * journal commit. + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in - * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for - * knfsd. + * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. 
* * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in @@ -3072,25 +4462,95 @@ out_brelse: * stuff(); * inode->i_size = expr; * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost. Plus the inode - * will no longer be on the superblock's dirty inode list. + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. */ -int ext4_write_inode(struct inode *inode, int wait) +int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { - if (current->flags & PF_MEMALLOC) + int err; + + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; - if (ext4_journal_current_handle()) { - jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); - dump_stack(); - return -EIO; + if (EXT4_SB(inode->i_sb)->s_journal) { + if (ext4_journal_current_handle()) { + jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); + dump_stack(); + return -EIO; + } + + /* + * No need to force transaction in WB_SYNC_NONE mode. Also + * ext4_sync_fs() will force the commit after everything is + * written. + */ + if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) + return 0; + + err = ext4_force_commit(inode->i_sb); + } else { + struct ext4_iloc iloc; + + err = __ext4_get_inode_loc(inode, &iloc, 0); + if (err) + return err; + /* + * sync(2) will flush the whole buffer cache. No need to do + * it here separately for each inode. + */ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) + sync_dirty_buffer(iloc.bh); + if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { + EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, + "IO error syncing inode"); + err = -EIO; + } + brelse(iloc.bh); } + return err; +} - if (!wait) - return 0; +/* + * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate + * buffers that are attached to a page straddling i_size and are undergoing + * commit. In that case we have to wait for commit to finish and try again. + */ +static void ext4_wait_for_tail_page_commit(struct inode *inode) +{ + struct page *page; + unsigned offset; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + tid_t commit_tid = 0; + int ret; - return ext4_force_commit(inode->i_sb); + offset = inode->i_size & (PAGE_CACHE_SIZE - 1); + /* + * All buffers in the last page remain valid? Then there's nothing to + * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE == + * blocksize case. + */ + if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits)) + return; + while (1) { + page = find_lock_page(inode->i_mapping, + inode->i_size >> PAGE_CACHE_SHIFT); + if (!page) + return; + ret = __ext4_journalled_invalidatepage(page, offset, + PAGE_CACHE_SIZE - offset); + unlock_page(page); + page_cache_release(page); + if (ret != -EBUSY) + return; + commit_tid = 0; + read_lock(&journal->j_state_lock); + if (journal->j_committing_transaction) + commit_tid = journal->j_committing_transaction->t_tid; + read_unlock(&journal->j_state_lock); + if (commit_tid) + jbd2_log_wait_commit(journal, commit_tid); + } } /* @@ -3108,31 +4568,42 @@ int ext4_write_inode(struct inode *inode, int wait) * be freed, so we have a strong guarantee that no future commit will * leave these blocks visible to the user.) * - * Called with inode->sem down. 
+ * Another thing we have to assure is that if we are in ordered mode + * and inode is still attached to the committing transaction, we must + * start writeout of all the dirty pages which are being truncated. + * This way we are sure that all the data written in the previous + * transaction are already on disk (truncate waits for pages under + * writeback). + * + * Called with inode->i_mutex down. */ int ext4_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; int error, rc = 0; + int orphan = 0; const unsigned int ia_valid = attr->ia_valid; error = inode_change_ok(inode, attr); if (error) return error; + if (is_quota_modification(inode, attr)) + dquot_initialize(inode); + if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) || + (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { - if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || - (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ - handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ - EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, + (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; } - error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0; + error = dquot_transfer(inode, attr); if (error) { ext4_journal_stop(handle); return error; @@ -3147,45 +4618,97 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ext4_journal_stop(handle); } - if (attr->ia_valid & ATTR_SIZE) { - if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { + if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { + handle_t *handle; + + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (attr->ia_size > sbi->s_bitmap_maxbytes) { - error = -EFBIG; - goto err_out; - } + if (attr->ia_size > sbi->s_bitmap_maxbytes) + return -EFBIG; } - } - if (S_ISREG(inode->i_mode) && - attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { - handle_t *handle; + if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) + inode_inc_iversion(inode); - handle = ext4_journal_start(inode, 3); - if (IS_ERR(handle)) { - error = PTR_ERR(handle); - goto err_out; - } + if (S_ISREG(inode->i_mode) && + (attr->ia_size < inode->i_size)) { + if (ext4_should_order_data(inode)) { + error = ext4_begin_ordered_truncate(inode, + attr->ia_size); + if (error) + goto err_out; + } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + if (ext4_handle_valid(handle)) { + error = ext4_orphan_add(handle, inode); + orphan = 1; + } + down_write(&EXT4_I(inode)->i_data_sem); + EXT4_I(inode)->i_disksize = attr->ia_size; + rc = ext4_mark_inode_dirty(handle, inode); + if (!error) + error = rc; + /* + * We have to update i_size under i_data_sem together + * with i_disksize to avoid races with writeback code + * running ext4_wb_update_i_disksize(). 
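+ * (Roughly: writeback derives a new i_disksize from i_size, so if a + * shrunken i_size were published outside i_data_sem, a racing + * writeback could immediately grow i_disksize past it again.)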
+ */ + if (!error) + i_size_write(inode, attr->ia_size); + up_write(&EXT4_I(inode)->i_data_sem); + ext4_journal_stop(handle); + if (error) { + ext4_orphan_del(NULL, inode); + goto err_out; + } + } else + i_size_write(inode, attr->ia_size); - error = ext4_orphan_add(handle, inode); - EXT4_I(inode)->i_disksize = attr->ia_size; - rc = ext4_mark_inode_dirty(handle, inode); - if (!error) - error = rc; - ext4_journal_stop(handle); + /* + * Blocks are going to be removed from the inode. Wait + * for dio in flight. Temporarily disable + * dioread_nolock to prevent livelock. + */ + if (orphan) { + if (!ext4_should_journal_data(inode)) { + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + ext4_inode_resume_unlocked_dio(inode); + } else + ext4_wait_for_tail_page_commit(inode); + } + /* + * Truncate pagecache after we've waited for commit + * in data=journal mode to make pages freeable. + */ + truncate_pagecache(inode, inode->i_size); } + /* + * We want to call ext4_truncate() even if attr->ia_size == + * inode->i_size for cases like truncation of fallocated space + */ + if (attr->ia_valid & ATTR_SIZE) + ext4_truncate(inode); - rc = inode_setattr(inode, attr); + if (!rc) { + setattr_copy(inode, attr); + mark_inode_dirty(inode); + } - /* If inode_setattr's call to ext4_truncate failed to get a - * transaction handle at all, we need to clean up the in-core - * orphan list manually. */ - if (inode->i_nlink) + /* + * If the call to ext4_truncate failed to get a transaction handle at + * all, we need to clean up the in-core orphan list manually. + */ + if (orphan && inode->i_nlink) ext4_orphan_del(NULL, inode); if (!rc && (ia_valid & ATTR_MODE)) - rc = ext4_acl_chmod(inode); + rc = posix_acl_chmod(inode, inode->i_mode); err_out: ext4_std_error(inode->i_sb, error); @@ -3194,67 +4717,142 @@ err_out: return error; } +int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode; + unsigned long long delalloc_blocks; + + inode = dentry->d_inode; + generic_fillattr(inode, stat); + + /* + * If there is inline data in the inode, the inode will normally not + * have data blocks allocated (it may have an external xattr block). + * Report at least one sector for such files, so tools like tar, rsync, + * and others don't incorrectly think the file is completely sparse. + */ + if (unlikely(ext4_has_inline_data(inode))) + stat->blocks += (stat->size + 511) >> 9; + + /* + * We can't update i_blocks if the block allocation is delayed; + * otherwise, in the case of a system crash before the real block + * allocation is done, we will have i_blocks inconsistent with + * on-disk file blocks. + * We always keep i_blocks updated together with real + * allocation. But to avoid confusing userspace, stat + * will return the blocks that include the delayed allocation + * blocks for this file. + */ + delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), + EXT4_I(inode)->i_reserved_data_blocks); + stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9); + return 0; +} + +static int ext4_index_trans_blocks(struct inode *inode, int lblocks, + int pextents) +{ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return ext4_ind_trans_blocks(inode, lblocks); + return ext4_ext_index_trans_blocks(inode, pextents); } /* - * How many blocks doth make a writepage()? 
- * - * With N blocks per page, it may be: - * N data blocks - * 2 indirect block - * 2 dindirect - * 1 tindirect - * N+5 bitmap blocks (from the above) - * N+5 group descriptor summary blocks - * 1 inode block - * 1 superblock. - * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files - * - * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS - * - * With ordered or writeback data it's the same, less the N data blocks. - * - * If the inode's direct blocks can hold an integral number of pages then a - * page cannot straddle two indirect blocks, and we can only touch one indirect - * and dindirect block, and the "5" above becomes "3". - * - * This still overestimates under most circumstances. If we were to pass the - * start and end offsets in here as well we could do block_to_path() on each - * block and work out the exact number of indirects which are touched. Pah. + * Account for index blocks, block group bitmaps and block group + * descriptor blocks if we modify data blocks and index blocks. In the + * worst case, the index blocks spread over different block groups. + * + * If data blocks are discontiguous, they can spread over + * different block groups too. Even if they are contiguous, with flexbg + * they could still cross a block group boundary. + * + * Also account for superblock, inode, quota and xattr blocks */ +static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents) +{ + ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); + int gdpblocks; + int idxblocks; + int ret = 0; + + /* + * How many index blocks do we need to touch to map @lblocks logical + * blocks to @pextents physical extents? + */ + idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); + + ret = idxblocks; + /* + * Now let's see how many group bitmaps and group descriptors need + * to be accounted for + */ + groups = idxblocks + pextents; + gdpblocks = groups; + if (groups > ngroups) + groups = ngroups; + if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) + gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; + + /* bitmaps and block group descriptor blocks */ + ret += groups + gdpblocks; + + /* Blocks for super block, inode, quota and xattr blocks */ + ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); + + return ret; +} + +/* + * Calculate the total number of credits to reserve to fit + * the modification of a single page into a single transaction, + * which may include multiple chunks of block allocations. + * + * This could be called via ext4_write_begin(). + * + * We need to consider the worst case, when there is + * one new block per extent. + */ int ext4_writepage_trans_blocks(struct inode *inode) { int bpp = ext4_journal_blocks_per_page(inode); int ret; - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) - return ext4_ext_writepage_trans_blocks(inode, bpp); + ret = ext4_meta_trans_blocks(inode, bpp, bpp); + /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) - ret = 3 * (bpp + indirects) + 2; - else - ret = 2 * (bpp + indirects) + 2; - -#ifdef CONFIG_QUOTA - /* We know that structure was already allocated during DQUOT_INIT so - * we will be updating only the data blocks + inodes */ - ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif - + ret += bpp; return ret; } /* + * Calculate the journal credits for a chunk of data modification. + * + * This is called from DIO, fallocate or whoever calls + * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. 
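+ * + * As an illustrative trace of ext4_meta_trans_blocks() above (numbers + * hypothetical): with idxblocks = 3 and pextents = 1, groups and + * gdpblocks are both 4 when neither is clamped by the group count, so + * the reservation is 3 + 4 + 4 + EXT4_META_TRANS_BLOCKS(sb) credits.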
+ * + * Journal buffers for data blocks are not included here, as DIO + * and fallocate do not need to journal data buffers. + */ +int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) +{ + return ext4_meta_trans_blocks(inode, nrblocks, 1); +} + +/* * The caller must have previously called ext4_reserve_inode_write(). * Given this, we know that the caller already has write access to iloc->bh. */ int ext4_mark_iloc_dirty(handle_t *handle, - struct inode *inode, struct ext4_iloc *iloc) + struct inode *inode, struct ext4_iloc *iloc) { int err = 0; - if (test_opt(inode->i_sb, I_VERSION)) + if (IS_I_VERSION(inode)) inode_inc_iversion(inode); /* the do_update_inode consumes one bh->b_count */ @@ -3275,16 +4873,15 @@ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { - int err = 0; - if (handle) { - err = ext4_get_inode_loc(inode, iloc); - if (!err) { - BUFFER_TRACE(iloc->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, iloc->bh); - if (err) { - brelse(iloc->bh); - iloc->bh = NULL; - } + int err; + + err = ext4_get_inode_loc(inode, iloc); + if (!err) { + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, iloc->bh); + if (err) { + brelse(iloc->bh); + iloc->bh = NULL; } } ext4_std_error(inode->i_sb, err); @@ -3302,7 +4899,6 @@ static int ext4_expand_extra_isize(struct inode *inode, { struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; - struct ext4_xattr_entry *entry; if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) return 0; @@ -3310,11 +4906,10 @@ static int ext4_expand_extra_isize(struct inode *inode, raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); - entry = IFIRST(header); /* No extended attributes present */ - if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || - header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || + header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, new_extra_isize); EXT4_I(inode)->i_extra_isize = new_extra_isize; @@ -3338,14 +4933,6 @@ static int ext4_expand_extra_isize(struct inode *inode, * inode out, but prune_icache isn't a user-visible syncing function. * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) * we start and wait on commits. - * - * Is this efficient/effective? Well, we're being nice to the system - * by cleaning up our inodes proactively so they can be reaped - * without I/O. But we are potentially leaving up to five seconds' - * worth of inodes floating about which prune_icache wants us to - * write out. One way to fix that would be to get prune_icache() - * to do a write_super() to free up some memory. It has the desired - * effect. */ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) { @@ -3355,9 +4942,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) int err, ret; might_sleep(); + trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); - if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && - !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { + if (ext4_handle_valid(handle) && + EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && + !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { /* * We need extra buffer credits since we may write into EA block * with this same handle. 
If journal_extend fails, then it will @@ -3371,10 +4960,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) sbi->s_want_extra_isize, iloc, handle); if (ret) { - EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; + ext4_set_inode_state(inode, + EXT4_STATE_NO_EXPAND); if (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count)) { - ext4_warning(inode->i_sb, __FUNCTION__, + ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete" " some EAs or run e2fsck.", inode->i_ino); @@ -3396,31 +4986,23 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * - * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks + * Also, dquot_alloc_block() will always dirty the inode when blocks * are allocated to the file. * * If the inode is marked synchronous, we don't honour that here - doing * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. */ -void ext4_dirty_inode(struct inode *inode) +void ext4_dirty_inode(struct inode *inode, int flags) { - handle_t *current_handle = ext4_journal_current_handle(); handle_t *handle; - handle = ext4_journal_start(inode, 2); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) goto out; - if (current_handle && - current_handle->h_transaction != handle->h_transaction) { - /* This task has a transaction open against a different fs */ - printk(KERN_EMERG "%s: transactions do not match!\n", - __FUNCTION__); - } else { - jbd_debug(5, "marking dirty. outer handle=%p\n", - current_handle); - ext4_mark_inode_dirty(handle, inode); - } + + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); out: return; @@ -3445,8 +5027,9 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode) BUFFER_TRACE(iloc.bh, "get_write_access"); err = jbd2_journal_get_write_access(handle, iloc.bh); if (!err) - err = ext4_journal_dirty_metadata(handle, - iloc.bh); + err = ext4_handle_dirty_metadata(handle, + NULL, + iloc.bh); brelse(iloc.bh); } } @@ -3472,11 +5055,27 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) */ journal = EXT4_JOURNAL(inode); + if (!journal) + return 0; if (is_journal_aborted(journal)) return -EROFS; + /* We have to allocate physical blocks for delalloc blocks + * before flushing the journal. Otherwise delalloc blocks cannot + * be allocated any more; even worse, a truncate on delalloc blocks + * could trigger a BUG by flushing delalloc blocks in the journal. + * There are no delalloc blocks in non-journal data mode. + */ + if (val && test_opt(inode->i_sb, DELALLOC)) { + err = ext4_alloc_da_blocks(inode); + if (err < 0) + return err; + } + + /* Wait for all existing dio workers */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); jbd2_journal_lock_updates(journal); - jbd2_journal_flush(journal); /* * OK, there are no updates running now, and all cached data is @@ -3487,23 +5086,119 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) */ if (val) - EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; - else - EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; + ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); + else { + jbd2_journal_flush(journal); + ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); + } ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); + ext4_inode_resume_unlocked_dio(inode); /* Finally we can mark the inode as dirty. 
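 * The handle is marked synchronous below, so the switch of journalling * mode is forced out to the journal when the handle is stopped.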
*/ - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) return PTR_ERR(handle); err = ext4_mark_inode_dirty(handle, inode); - handle->h_sync = 1; + ext4_handle_sync(handle); ext4_journal_stop(handle); ext4_std_error(inode->i_sb, err); return err; } + +static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) +{ + return !buffer_mapped(bh); +} + +int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + loff_t size; + unsigned long len; + int ret; + struct file *file = vma->vm_file; + struct inode *inode = file_inode(file); + struct address_space *mapping = inode->i_mapping; + handle_t *handle; + get_block_t *get_block; + int retries = 0; + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + /* Delalloc case is easy... */ + if (test_opt(inode->i_sb, DELALLOC) && + !ext4_should_journal_data(inode) && + !ext4_nonda_switch(inode->i_sb)) { + do { + ret = __block_page_mkwrite(vma, vmf, + ext4_da_get_block_prep); + } while (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)); + goto out_ret; + } + + lock_page(page); + size = i_size_read(inode); + /* Page got truncated from under us? */ + if (page->mapping != mapping || page_offset(page) > size) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + goto out; + } + + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + /* + * Return if we have all the buffers mapped. This avoids the need to do + * journal_start/journal_stop which can block and take a long time + */ + if (page_has_buffers(page)) { + if (!ext4_walk_page_buffers(NULL, page_buffers(page), + 0, len, NULL, + ext4_bh_unmapped)) { + /* Wait so that we don't change page under IO */ + wait_for_stable_page(page); + ret = VM_FAULT_LOCKED; + goto out; + } + } + unlock_page(page); + /* OK, we need to fill the hole... */ + if (ext4_should_dioread_nolock(inode)) + get_block = ext4_get_block_write; + else + get_block = ext4_get_block; +retry_alloc: + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, + ext4_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = VM_FAULT_SIGBUS; + goto out; + } + ret = __block_page_mkwrite(vma, vmf, get_block); + if (!ret && ext4_should_journal_data(inode)) { + if (ext4_walk_page_buffers(handle, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { + unlock_page(page); + ret = VM_FAULT_SIGBUS; + ext4_journal_stop(handle); + goto out; + } + ext4_set_inode_state(inode, EXT4_STATE_JDATA); + } + ext4_journal_stop(handle); + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_alloc; +out_ret: + ret = block_page_mkwrite_return(ret); +out: + sb_end_pagefault(inode->i_sb); + return ret; +} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 2ed7c37f897..0f2252ec274 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -10,21 +10,202 @@ #include <linux/fs.h> #include <linux/jbd2.h> #include <linux/capability.h> -#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> #include <linux/time.h> #include <linux/compat.h> -#include <linux/smp_lock.h> +#include <linux/mount.h> +#include <linux/file.h> #include <asm/uaccess.h> +#include "ext4_jbd2.h" +#include "ext4.h" -int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, - unsigned long arg) +#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1) + +/** + * Swap memory between @a and @b for @len bytes. 
+ * + * @a: pointer to first memory area + * @b: pointer to second memory area + * @len: number of bytes to swap + * + */ +static void memswap(void *a, void *b, size_t len) +{ + unsigned char *ap, *bp; + unsigned char tmp; + + ap = (unsigned char *)a; + bp = (unsigned char *)b; + while (len-- > 0) { + tmp = *ap; + *ap = *bp; + *bp = tmp; + ap++; + bp++; + } +} + +/** + * Swap i_data and associated attributes between @inode1 and @inode2. + * This function is used for the primary swap between inode1 and inode2 + * and also to revert this primary swap in case of errors. + * + * Therefore you have to make sure that calling this function twice + * will revert all changes. + * + * @inode1: pointer to first inode + * @inode2: pointer to second inode + */ +static void swap_inode_data(struct inode *inode1, struct inode *inode2) +{ + loff_t isize; + struct ext4_inode_info *ei1; + struct ext4_inode_info *ei2; + + ei1 = EXT4_I(inode1); + ei2 = EXT4_I(inode2); + + memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags)); + memswap(&inode1->i_version, &inode2->i_version, + sizeof(inode1->i_version)); + memswap(&inode1->i_blocks, &inode2->i_blocks, + sizeof(inode1->i_blocks)); + memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes)); + memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime)); + memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime)); + + memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); + memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags)); + memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); + ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); + ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); + ext4_es_lru_del(inode1); + ext4_es_lru_del(inode2); + + isize = i_size_read(inode1); + i_size_write(inode1, i_size_read(inode2)); + i_size_write(inode2, isize); +} + +/** + * Swap the information between the given @inode and the inode + * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other + * important fields of the inodes. + * + * @sb: the super block of the filesystem + * @inode: the inode to swap with EXT4_BOOT_LOADER_INO + * + */ +static long swap_inode_boot_loader(struct super_block *sb, + struct inode *inode) +{ + handle_t *handle; + int err; + struct inode *inode_bl; + struct ext4_inode_info *ei_bl; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) + return -EINVAL; + + if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) + return -EPERM; + + inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); + if (IS_ERR(inode_bl)) + return PTR_ERR(inode_bl); + ei_bl = EXT4_I(inode_bl); + + filemap_flush(inode->i_mapping); + filemap_flush(inode_bl->i_mapping); + + /* Protect orig inodes against a truncate and make sure + * that only one swap_inode_boot_loader is running. 
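+ * (lock_two_nondirectories() below takes the two i_mutexes in a fixed + * order, so concurrent swaps of the same pair cannot deadlock.)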
*/ + lock_two_nondirectories(inode, inode_bl); + + truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages(&inode_bl->i_data, 0); + + /* Wait for all existing dio workers */ + ext4_inode_block_unlocked_dio(inode); + ext4_inode_block_unlocked_dio(inode_bl); + inode_dio_wait(inode); + inode_dio_wait(inode_bl); + + handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); + if (IS_ERR(handle)) { + err = -EINVAL; + goto journal_err_out; + } + + /* Protect extent tree against block allocations via delalloc */ + ext4_double_down_write_data_sem(inode, inode_bl); + + if (inode_bl->i_nlink == 0) { + /* this inode has never been used as a BOOT_LOADER */ + set_nlink(inode_bl, 1); + i_uid_write(inode_bl, 0); + i_gid_write(inode_bl, 0); + inode_bl->i_flags = 0; + ei_bl->i_flags = 0; + inode_bl->i_version = 1; + i_size_write(inode_bl, 0); + inode_bl->i_mode = S_IFREG; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_EXTENTS)) { + ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode_bl); + } else + memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); + } + + swap_inode_data(inode, inode_bl); + + inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode); + + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + inode_bl->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + + ext4_discard_preallocations(inode); + + err = ext4_mark_inode_dirty(handle, inode); + if (err < 0) { + ext4_warning(inode->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode->i_ino, err); + /* Revert all changes: */ + swap_inode_data(inode, inode_bl); + } else { + err = ext4_mark_inode_dirty(handle, inode_bl); + if (err < 0) { + ext4_warning(inode_bl->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode_bl->i_ino, err); + /* Revert all changes: */ + swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); + } + } + ext4_journal_stop(handle); + ext4_double_up_write_data_sem(inode, inode_bl); + +journal_err_out: + ext4_inode_resume_unlocked_dio(inode); + ext4_inode_resume_unlocked_dio(inode_bl); + unlock_two_nondirectories(inode, inode_bl); + iput(inode_bl); + return err; +} + +long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; - unsigned short rsv_window_size; - ext4_debug ("cmd = %u, arg = %lu\n", cmd, arg); + ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); switch (cmd) { case EXT4_IOC_GETFLAGS: @@ -33,29 +214,29 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, return put_user(flags, (int __user *) arg); case EXT4_IOC_SETFLAGS: { handle_t *handle = NULL; - int err; + int err, migrate = 0; struct ext4_iloc iloc; - unsigned int oldflags; + unsigned int oldflags, mask, i; unsigned int jflag; - if (IS_RDONLY(inode)) - return -EROFS; - - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EACCES; if (get_user(flags, (int __user *) arg)) return -EFAULT; - if (!S_ISDIR(inode->i_mode)) - flags &= ~EXT4_DIRSYNC_FL; + err = mnt_want_write_file(filp); + if (err) + return err; + + flags = ext4_mask_flags(inode->i_mode, flags); + err = -EPERM; mutex_lock(&inode->i_mutex); /* Is it quota file? 
Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) { - mutex_unlock(&inode->i_mutex); - return -EPERM; - } + if (IS_NOQUOTA(inode)) + goto flags_out; + oldflags = ei->i_flags; /* The JOURNAL_DATA flag is modifiable only by root */ @@ -68,10 +249,8 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * This test looks nicer. Thanks to Pauline Middelink */ if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); - return -EPERM; - } + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; } /* @@ -79,27 +258,40 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * the relevant capability. */ if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) { - mutex_unlock(&inode->i_mutex); - return -EPERM; - } + if (!capable(CAP_SYS_RESOURCE)) + goto flags_out; } + if ((flags ^ oldflags) & EXT4_EXTENTS_FL) + migrate = 1; + + if (flags & EXT4_EOFBLOCKS_FL) { + /* we don't support adding EOFBLOCKS flag */ + if (!(oldflags & EXT4_EOFBLOCKS_FL)) { + err = -EOPNOTSUPP; + goto flags_out; + } + } else if (oldflags & EXT4_EOFBLOCKS_FL) + ext4_truncate(inode); - - handle = ext4_journal_start(inode, 1); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { - mutex_unlock(&inode->i_mutex); - return PTR_ERR(handle); + err = PTR_ERR(handle); + goto flags_out; } if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto flags_err; - flags = flags & EXT4_FL_USER_MODIFIABLE; - flags |= oldflags & ~EXT4_FL_USER_MODIFIABLE; - ei->i_flags = flags; + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { + if (!(mask & EXT4_FL_USER_MODIFIABLE)) + continue; + if (mask & flags) + ext4_set_inode_flag(inode, i); + else + ext4_clear_inode_flag(inode, i); + } ext4_set_inode_flags(inode); inode->i_ctime = ext4_current_time(inode); @@ -107,14 +299,23 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, err = ext4_mark_iloc_dirty(handle, inode, &iloc); flags_err: ext4_journal_stop(handle); - if (err) { - mutex_unlock(&inode->i_mutex); - return err; - } + if (err) + goto flags_out; if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) err = ext4_change_inode_journal_flag(inode, jflag); + if (err) + goto flags_out; + if (migrate) { + if (flags & EXT4_EXTENTS_FL) + err = ext4_ext_migrate(inode); + else + err = ext4_ind_migrate(inode); + } + +flags_out: mutex_unlock(&inode->i_mutex); + mnt_drop_write_file(filp); return err; } case EXT4_IOC_GETVERSION: @@ -127,16 +328,30 @@ flags_err: __u32 generation; int err; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; - if (get_user(generation, (int __user *) arg)) - return -EFAULT; - handle = ext4_journal_start(inode, 1); - if (IS_ERR(handle)) - return PTR_ERR(handle); + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + ext4_warning(sb, "Setting inode version is not " + "supported with metadata_csum enabled."); + return -ENOTTY; + } + + err = mnt_want_write_file(filp); + if (err) + return err; + if (get_user(generation, (int __user *) arg)) { + err = -EFAULT; + goto setversion_out; + } + + mutex_lock(&inode->i_mutex); + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto unlock_out; + } err = ext4_reserve_inode_write(handle, inode, 
&iloc); if (err == 0) { inode->i_ctime = ext4_current_time(inode); @@ -144,118 +359,257 @@ flags_err: err = ext4_mark_iloc_dirty(handle, inode, &iloc); } ext4_journal_stop(handle); + +unlock_out: + mutex_unlock(&inode->i_mutex); +setversion_out: + mnt_drop_write_file(filp); return err; } -#ifdef CONFIG_JBD2_DEBUG - case EXT4_IOC_WAIT_FOR_READONLY: - /* - * This is racy - by the time we're woken up and running, - * the superblock could be released. And the module could - * have been unloaded. So sue me. - * - * Returns 1 if it slept, else zero. - */ - { - struct super_block *sb = inode->i_sb; - DECLARE_WAITQUEUE(wait, current); - int ret = 0; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait); - if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) { - schedule(); - ret = 1; - } - remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait); - return ret; + case EXT4_IOC_GROUP_EXTEND: { + ext4_fsblk_t n_blocks_count; + int err, err2=0; + + err = ext4_resize_begin(sb); + if (err) + return err; + + if (get_user(n_blocks_count, (__u32 __user *)arg)) { + err = -EFAULT; + goto group_extend_out; } -#endif - case EXT4_IOC_GETRSVSZ: - if (test_opt(inode->i_sb, RESERVATION) - && S_ISREG(inode->i_mode) - && ei->i_block_alloc_info) { - rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size; - return put_user(rsv_window_size, (int __user *)arg); + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + err = -EOPNOTSUPP; + goto group_extend_out; } - return -ENOTTY; - case EXT4_IOC_SETRSVSZ: { - if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) - return -ENOTTY; + err = mnt_want_write_file(filp); + if (err) + goto group_extend_out; - if (IS_RDONLY(inode)) - return -EROFS; + err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write_file(filp); +group_extend_out: + ext4_resize_end(sb); + return err; + } - if (!is_owner_or_cap(inode)) - return -EACCES; + case EXT4_IOC_MOVE_EXT: { + struct move_extent me; + struct fd donor; + int err; - if (get_user(rsv_window_size, (int __user *)arg)) + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&me, + (struct move_extent __user *)arg, sizeof(me))) return -EFAULT; + me.moved_len = 0; - if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS) - rsv_window_size = EXT4_MAX_RESERVE_BLOCKS; + donor = fdget(me.donor_fd); + if (!donor.file) + return -EBADF; - /* - * need to allocate reservation structure for this inode - * before set the window size - */ - down_write(&ei->i_data_sem); - if (!ei->i_block_alloc_info) - ext4_init_block_alloc_info(inode); + if (!(donor.file->f_mode & FMODE_WRITE)) { + err = -EBADF; + goto mext_out; + } - if (ei->i_block_alloc_info){ - struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; - rsv->rsv_goal_size = rsv_window_size; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with bigalloc"); + err = -EOPNOTSUPP; + goto mext_out; } - up_write(&ei->i_data_sem); - return 0; + + err = mnt_want_write_file(filp); + if (err) + goto mext_out; + + err = ext4_move_extents(filp, donor.file, 
me.orig_start, + me.donor_start, me.len, &me.moved_len); + mnt_drop_write_file(filp); + + if (copy_to_user((struct move_extent __user *)arg, + &me, sizeof(me))) + err = -EFAULT; +mext_out: + fdput(donor); + return err; } + case EXT4_IOC_GROUP_ADD: { + struct ext4_new_group_data input; + int err, err2=0; + + err = ext4_resize_begin(sb); + if (err) + return err; + + if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, + sizeof(input))) { + err = -EFAULT; + goto group_add_out; + } + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + err = -EOPNOTSUPP; + goto group_add_out; + } + + err = mnt_want_write_file(filp); + if (err) + goto group_add_out; + err = ext4_group_add(sb, &input); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write_file(filp); + if (!err && ext4_has_group_desc_csum(sb) && + test_opt(sb, INIT_INODE_TABLE)) + err = ext4_register_li_request(sb, input.group); +group_add_out: + ext4_resize_end(sb); return err; } + + case EXT4_IOC_MIGRATE: + { int err; + if (!inode_owner_or_capable(inode)) + return -EACCES; - if (!capable(CAP_SYS_RESOURCE)) - return -EPERM; + err = mnt_want_write_file(filp); + if (err) + return err; + /* + * The inode mutex prevents write and truncate on the file. + * Reads still go through. We take i_data_sem in + * ext4_ext_swap_inode_data before we switch the + * inode format, to prevent reads. 
+ */ + mutex_lock(&(inode->i_mutex)); + err = ext4_ext_migrate(inode); + mutex_unlock(&(inode->i_mutex)); + mnt_drop_write_file(filp); + return err; + } + + case EXT4_IOC_ALLOC_DA_BLKS: + { + int err; + if (!inode_owner_or_capable(inode)) + return -EACCES; + + err = mnt_want_write_file(filp); + if (err) + return err; + err = ext4_alloc_da_blocks(inode); + mnt_drop_write_file(filp); + return err; + } - if (IS_RDONLY(inode)) - return -EROFS; + case EXT4_IOC_SWAP_BOOT: + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + return swap_inode_boot_loader(sb, inode); - if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, - sizeof(input))) + case EXT4_IOC_RESIZE_FS: { + ext4_fsblk_t n_blocks_count; + int err = 0, err2 = 0; + ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not (yet) supported with bigalloc"); + return -EOPNOTSUPP; + } + + if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, + sizeof(__u64))) { return -EFAULT; + } - err = ext4_group_add(sb, &input); - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - jbd2_journal_flush(EXT4_SB(sb)->s_journal); - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + err = ext4_resize_begin(sb); + if (err) + return err; + err = mnt_want_write_file(filp); + if (err) + goto resizefs_out; + + err = ext4_resize_fs(sb, n_blocks_count); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write_file(filp); + if (!err && (o_group > EXT4_SB(sb)->s_groups_count) && + ext4_has_group_desc_csum(sb) && + test_opt(sb, INIT_INODE_TABLE)) + err = ext4_register_li_request(sb, o_group); + +resizefs_out: + ext4_resize_end(sb); return err; } - case EXT4_IOC_MIGRATE: - return ext4_ext_migrate(inode, filp, cmd, arg); + case FITRIM: + { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + range.minlen = max((unsigned int)range.minlen, + q->limits.discard_granularity); + ret = ext4_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range __user *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + case EXT4_IOC_PRECACHE_EXTENTS: + return ext4_ext_precache(inode); default: return -ENOTTY; @@ -265,9 +619,6 @@ flags_err: #ifdef CONFIG_COMPAT long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inode *inode = file->f_path.dentry->d_inode; - int ret; - /* These are just misnamed, they actually get/put from/to user an int */ switch (cmd) { case EXT4_IOC32_GETFLAGS: @@ -291,25 +642,43 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC32_SETVERSION_OLD: cmd = EXT4_IOC_SETVERSION_OLD; break; -#ifdef CONFIG_JBD2_DEBUG - case EXT4_IOC32_WAIT_FOR_READONLY: - cmd = EXT4_IOC_WAIT_FOR_READONLY; - break; -#endif case EXT4_IOC32_GETRSVSZ: cmd = EXT4_IOC_GETRSVSZ; break; case EXT4_IOC32_SETRSVSZ: cmd = EXT4_IOC_SETRSVSZ; break; - case EXT4_IOC_GROUP_ADD: + case EXT4_IOC32_GROUP_ADD: { + struct compat_ext4_new_group_input __user *uinput; + struct ext4_new_group_input input; + 
mm_segment_t old_fs; + int err; + + uinput = compat_ptr(arg); + err = get_user(input.group, &uinput->group); + err |= get_user(input.block_bitmap, &uinput->block_bitmap); + err |= get_user(input.inode_bitmap, &uinput->inode_bitmap); + err |= get_user(input.inode_table, &uinput->inode_table); + err |= get_user(input.blocks_count, &uinput->blocks_count); + err |= get_user(input.reserved_blocks, + &uinput->reserved_blocks); + if (err) + return -EFAULT; + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD, + (unsigned long) &input); + set_fs(old_fs); + return err; + } + case EXT4_IOC_MOVE_EXT: + case FITRIM: + case EXT4_IOC_RESIZE_FS: + case EXT4_IOC_PRECACHE_EXTENTS: break; default: return -ENOIOCTLCMD; } - lock_kernel(); - ret = ext4_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg)); - unlock_kernel(); - return ret; + return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); } #endif diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index ef97f19c2f9..2dcb936be90 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -21,20 +21,19 @@ * mballoc.c contains the multiblocks allocation routines */ -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/namei.h> -#include <linux/ext4_jbd2.h> -#include <linux/ext4_fs.h> -#include <linux/quotaops.h> -#include <linux/buffer_head.h> +#include "ext4_jbd2.h" +#include "mballoc.h" +#include <linux/log2.h> #include <linux/module.h> -#include <linux/swap.h> -#include <linux/proc_fs.h> -#include <linux/pagemap.h> -#include <linux/seq_file.h> -#include <linux/version.h> -#include "group.h" +#include <linux/slab.h> +#include <trace/events/ext4.h> + +#ifdef CONFIG_EXT4_DEBUG +ushort ext4_mballoc_debug __read_mostly; + +module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644); +MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc"); +#endif /* * MUSTDO: @@ -60,32 +59,33 @@ * The allocation request involves a request for multiple blocks * near to the goal (block) value specified. * - * During initialization phase of the allocator we decide to use the group - * preallocation or inode preallocation depending on the size file. The - * size of the file could be the resulting file size we would have after - * allocation or the current file size which ever is larger. If the size is - * less that sbi->s_mb_stream_request we select the group - * preallocation. The default value of s_mb_stream_request is 16 - * blocks. This can also be tuned via - * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms - * of number of blocks. + * During initialization phase of the allocator we decide to use the + * group preallocation or inode preallocation depending on the size of + * the file. The size of the file could be the resulting file size we + * would have after allocation, or the current file size, whichever + * is larger. If the size is less than sbi->s_mb_stream_request we + * select the group preallocation. The default value of + * s_mb_stream_request is 16 blocks. This can also be tuned via + * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in + * terms of number of blocks. * * The main motivation for having small files use group preallocation is to - * ensure that we have small file closer in the disk. + * ensure that we have small files closer together on the disk. * - * First stage the allocator looks at the inode prealloc list - * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for - * this particular inode. 
The inode prealloc space is represented as: + * First stage the allocator looks at the inode prealloc list, + * ext4_inode_info->i_prealloc_list, which contains list of prealloc + * spaces for this particular inode. The inode prealloc space is + * represented as: * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space - * pa_len -> lenght for this prealloc space - * pa_free -> free space available in this prealloc space + * pa_len -> length for this prealloc space (in clusters) + * pa_free -> free space available in this prealloc space (in clusters) * * The inode preallocation space is used looking at the _logical_ start * block. If only the logical file block falls within the range of prealloc - * space we will consume the particular prealloc space. This make sure that - * that the we have contiguous physical blocks representing the file blocks + * space we will consume the particular prealloc space. This makes sure that + * we have contiguous physical blocks representing the file blocks * * The important thing to be noted in case of inode prealloc space is that * we don't modify the values associated to inode prealloc space except @@ -93,7 +93,7 @@ * * If we are not able to find blocks in the inode prealloc space and if we * have the group allocation flag set then we look at the locality group - * prealloc space. These are per CPU prealloc list repreasented as + * prealloc space. These are per CPU prealloc list represented as * * ext4_sb_info.s_locality_groups[smp_processor_id()] * @@ -101,7 +101,7 @@ * between CPUs. It is possible to get scheduled at this point. * * The locality group prealloc space is used looking at whether we have - * enough free space (pa_free) withing the prealloc space. + * enough free space (pa_free) within the prealloc space. * * If we can't allocate blocks via inode prealloc or/and locality group * prealloc then we look at the buddy cache. The buddy cache is represented @@ -114,7 +114,7 @@ * inode as: * * { page } - * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. So for each group we @@ -135,33 +135,35 @@ * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to - * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to + * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is + * dependent on the cluster size; for non-bigalloc file systems, it is * 512 blocks. This can be tuned via - * /proc/fs/ext4/<partition/group_prealloc. The value is represented in + * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in * terms of number of blocks. If we have mounted the file system with -O * stripe=<value> option the group prealloc request is normalized to the - * stripe value (sbi->s_stripe) + * the smallest multiple of the stripe value (sbi->s_stripe) which is + * greater than the default mb_group_prealloc. * - * The regular allocator(using the buddy cache) support few tunables. + * The regular allocator (using the buddy cache) supports a few tunables. 
* - * /proc/fs/ext4/<partition>/min_to_scan - * /proc/fs/ext4/<partition>/max_to_scan - * /proc/fs/ext4/<partition>/order2_req + * /sys/fs/ext4/<partition>/mb_min_to_scan + * /sys/fs/ext4/<partition>/mb_max_to_scan + * /sys/fs/ext4/<partition>/mb_order2_req * - * The regular allocator use buddy scan only if the request len is power of + * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The * value of s_mb_order2_reqs can be tuned via - * /proc/fs/ext4/<partition>/order2_req. If the request len is equal to - * stripe size (sbi->s_stripe), we try to search for contigous block in - * stripe size. This should result in better allocation on RAID setup. If - * not we search in the specific group using bitmap for best extents. The - * tunable min_to_scan and max_to_scan controll the behaviour here. + * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to + * stripe size (sbi->s_stripe), we try to search for contiguous block in + * stripe size. This should result in better allocation on RAID setups. If + * not, we search in the specific group using bitmap for best extents. The + * tunable min_to_scan and max_to_scan control the behaviour here. * min_to_scan indicate how long the mballoc __must__ look for a best - * extent and max_to_scanindicate how long the mballoc __can__ look for a + * extent and max_to_scan indicates how long the mballoc __can__ look for a * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex. Each group is first checked based on the criteria whether it - * can used for allocation. ext4_mb_good_group explains how the groups are + * can be used for allocation. ext4_mb_good_group explains how the groups are * checked. * * Both the prealloc space are getting populated as above. So for the first @@ -344,288 +346,28 @@ * object * */ - -/* - * with AGGRESSIVE_CHECK allocator runs consistency checks over - * structures. these checks slow things down a lot - */ -#define AGGRESSIVE_CHECK__ - -/* - * with DOUBLE_CHECK defined mballoc creates persistent in-core - * bitmaps, maintains and uses them to check for double allocations - */ -#define DOUBLE_CHECK__ - -/* - */ -#define MB_DEBUG__ -#ifdef MB_DEBUG -#define mb_debug(fmt, a...) printk(fmt, ##a) -#else -#define mb_debug(fmt, a...) -#endif - -/* - * with EXT4_MB_HISTORY mballoc stores last N allocations in memory - * and you can monitor it in /proc/fs/ext4/<dev>/mb_history - */ -#define EXT4_MB_HISTORY -#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ -#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ -#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */ -#define EXT4_MB_HISTORY_FREE 8 /* free */ - -#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \ - EXT4_MB_HISTORY_PREALLOC) - -/* - * How long mballoc can look for a best extent (in found extents) - */ -#define MB_DEFAULT_MAX_TO_SCAN 200 - -/* - * How long mballoc must look for a best extent - */ -#define MB_DEFAULT_MIN_TO_SCAN 10 - -/* - * How many groups mballoc will scan looking for the best chunk - */ -#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5 - -/* - * with 'ext4_mb_stats' allocator will collect stats that will be - * shown at umount. The collecting costs though! 
- */ -#define MB_DEFAULT_STATS 1 - -/* - * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served - * by the stream allocator, which purpose is to pack requests - * as close each to other as possible to produce smooth I/O traffic - * We use locality group prealloc space for stream request. - * We can tune the same via /proc/fs/ext4/<parition>/stream_req - */ -#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ - -/* - * for which requests use 2^N search using buddies - */ -#define MB_DEFAULT_ORDER2_REQS 2 - -/* - * default group prealloc size 512 blocks - */ -#define MB_DEFAULT_GROUP_PREALLOC 512 - static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; - -#ifdef EXT4_BB_MAX_BLOCKS -#undef EXT4_BB_MAX_BLOCKS -#endif -#define EXT4_BB_MAX_BLOCKS 30 - -struct ext4_free_metadata { - ext4_group_t group; - unsigned short num; - ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS]; - struct list_head list; -}; - -struct ext4_group_info { - unsigned long bb_state; - unsigned long bb_tid; - struct ext4_free_metadata *bb_md_cur; - unsigned short bb_first_free; - unsigned short bb_free; - unsigned short bb_fragments; - struct list_head bb_prealloc_list; -#ifdef DOUBLE_CHECK - void *bb_bitmap; -#endif - unsigned short bb_counters[]; -}; - -#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 -#define EXT4_GROUP_INFO_LOCKED_BIT 1 - -#define EXT4_MB_GRP_NEED_INIT(grp) \ - (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) - - -struct ext4_prealloc_space { - struct list_head pa_inode_list; - struct list_head pa_group_list; - union { - struct list_head pa_tmp_list; - struct rcu_head pa_rcu; - } u; - spinlock_t pa_lock; - atomic_t pa_count; - unsigned pa_deleted; - ext4_fsblk_t pa_pstart; /* phys. block */ - ext4_lblk_t pa_lstart; /* log. block */ - unsigned short pa_len; /* len of preallocated chunk */ - unsigned short pa_free; /* how many blocks are free */ - unsigned short pa_linear; /* consumed in one direction - * strictly, for grp prealloc */ - spinlock_t *pa_obj_lock; - struct inode *pa_inode; /* hack, for history only */ -}; - - -struct ext4_free_extent { - ext4_lblk_t fe_logical; - ext4_grpblk_t fe_start; - ext4_group_t fe_group; - int fe_len; +static struct kmem_cache *ext4_free_data_cachep; + +/* We create slab caches for groupinfo data structures based on the + * superblock block size. 
There will be one per mounted filesystem for + * each unique s_blocksize_bits */ +#define NR_GRPINFO_CACHES 8 +static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; + +static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { + "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", + "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", + "ext4_groupinfo_64k", "ext4_groupinfo_128k" }; -/* - * Locality group: - * we try to group all related changes together - * so that writeback can flush/allocate them together as well - */ -struct ext4_locality_group { - /* for allocator */ - struct mutex lg_mutex; /* to serialize allocates */ - struct list_head lg_prealloc_list;/* list of preallocations */ - spinlock_t lg_prealloc_lock; -}; - -struct ext4_allocation_context { - struct inode *ac_inode; - struct super_block *ac_sb; - - /* original request */ - struct ext4_free_extent ac_o_ex; - - /* goal request (after normalization) */ - struct ext4_free_extent ac_g_ex; - - /* the best found extent */ - struct ext4_free_extent ac_b_ex; - - /* copy of the bext found extent taken before preallocation efforts */ - struct ext4_free_extent ac_f_ex; - - /* number of iterations done. we have to track to limit searching */ - unsigned long ac_ex_scanned; - __u16 ac_groups_scanned; - __u16 ac_found; - __u16 ac_tail; - __u16 ac_buddy; - __u16 ac_flags; /* allocation hints */ - __u8 ac_status; - __u8 ac_criteria; - __u8 ac_repeats; - __u8 ac_2order; /* if request is to allocate 2^N blocks and - * N > 0, the field stores N, otherwise 0 */ - __u8 ac_op; /* operation, for history only */ - struct page *ac_bitmap_page; - struct page *ac_buddy_page; - struct ext4_prealloc_space *ac_pa; - struct ext4_locality_group *ac_lg; -}; - -#define AC_STATUS_CONTINUE 1 -#define AC_STATUS_FOUND 2 -#define AC_STATUS_BREAK 3 - -struct ext4_mb_history { - struct ext4_free_extent orig; /* orig allocation */ - struct ext4_free_extent goal; /* goal allocation */ - struct ext4_free_extent result; /* result allocation */ - unsigned pid; - unsigned ino; - __u16 found; /* how many extents have been found */ - __u16 groups; /* how many groups have been scanned */ - __u16 tail; /* what tail broke some buddy */ - __u16 buddy; /* buddy the tail ^^^ broke */ - __u16 flags; - __u8 cr:3; /* which phase the result extent was found at */ - __u8 op:4; - __u8 merged:1; -}; - -struct ext4_buddy { - struct page *bd_buddy_page; - void *bd_buddy; - struct page *bd_bitmap_page; - void *bd_bitmap; - struct ext4_group_info *bd_info; - struct super_block *bd_sb; - __u16 bd_blkbits; - ext4_group_t bd_group; -}; -#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) -#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) - -#ifndef EXT4_MB_HISTORY -static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) -{ - return; -} -#else -static void ext4_mb_store_history(struct ext4_allocation_context *ac); -#endif - -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - -static struct proc_dir_entry *proc_root_ext4; -struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); -ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned long *count, int *errp); - static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); -static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *); -static void ext4_mb_free_committed_blocks(struct super_block *); -static void ext4_mb_return_to_preallocation(struct 
inode *inode, - struct ext4_buddy *e4b, sector_t block, - int count); -static void ext4_mb_put_pa(struct ext4_allocation_context *, - struct super_block *, struct ext4_prealloc_space *pa); -static int ext4_mb_init_per_dev_proc(struct super_block *sb); -static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); - - -static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); -} - -static inline void ext4_unlock_group(struct super_block *sb, - ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); -} - -static inline int ext4_is_group_locked(struct super_block *sb, - ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, - &(grinfo->bb_state)); -} - -static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, - struct ext4_free_extent *fex) -{ - ext4_fsblk_t block; - - block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb) - + fex->fe_start - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - return block; -} +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group); +static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, int rc); static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { @@ -657,49 +399,49 @@ static inline void mb_set_bit(int bit, void *addr) ext4_set_bit(bit, addr); } -static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr) -{ - addr = mb_correct_addr_and_bit(&bit, addr); - ext4_set_bit_atomic(lock, bit, addr); -} - static inline void mb_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_clear_bit(bit, addr); } -static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) +static inline int mb_test_and_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); - ext4_clear_bit_atomic(lock, bit, addr); + return ext4_test_and_clear_bit(bit, addr); } static inline int mb_find_next_zero_bit(void *addr, int max, int start) { - int fix = 0; + int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); - max += fix; + tmpmax = max + fix; start += fix; - return ext4_find_next_zero_bit(addr, max, start) - fix; + ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; } static inline int mb_find_next_bit(void *addr, int max, int start) { - int fix = 0; + int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); - max += fix; + tmpmax = max + fix; start += fix; - return ext4_find_next_bit(addr, max, start) - fix; + ret = ext4_find_next_bit(addr, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; } static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; - BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(max == NULL); if (order > e4b->bd_blkbits + 1) { @@ -708,11 +450,12 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) } /* at order 0 we see each particular block */ - *max = 1 << (e4b->bd_blkbits + 3); - if (order == 0) - return EXT4_MB_BITMAP(e4b); + if (order == 0) { + *max = 1 << (e4b->bd_blkbits + 3); + return e4b->bd_bitmap; + } - bb = 
EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; + bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; return bb; @@ -727,19 +470,19 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; - BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); for (i = 0; i < count; i++) { if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { ext4_fsblk_t blocknr; - blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); - blocknr += first + i; - blocknr += - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - - ext4_error(sb, __FUNCTION__, "double-free of inode" - " %lu's block %llu(bit %u in group %lu)\n", - inode ? inode->i_ino : 0, blocknr, - first + i, e4b->bd_group); + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); + blocknr += EXT4_C2B(EXT4_SB(sb), first + i); + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? inode->i_ino : 0, + blocknr, + "freeing block already freed " + "(bit %u)", + first + i); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } @@ -751,7 +494,7 @@ static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; - BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); for (i = 0; i < count; i++) { BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); mb_set_bit(first + i, e4b->bd_info->bb_bitmap); @@ -767,9 +510,11 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) b2 = (unsigned char *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) { - printk("corruption in group %lu at byte %u(%u):" - " %x in copy != %x on disk/prealloc\n", - e4b->bd_group, i, i * 8, b1[i], b2[i]); + ext4_msg(e4b->bd_sb, KERN_ERR, + "corruption in group %u " + "at byte %u(%u): %x in copy != %x " + "on disk/prealloc", + e4b->bd_group, i, i * 8, b1[i], b2[i]); BUG(); } } @@ -823,9 +568,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, void *buddy; void *buddy2; - if (!test_opt(sb, MBALLOC)) - return 0; - { static int mb_check_counter; if (mb_check_counter++ % 100 != 0) @@ -855,14 +597,14 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, continue; } - /* both bits in buddy2 must be 0 */ + /* both bits in buddy2 must be 1 */ MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); for (j = 0; j < (1 << order); j++) { k = (i * (1 << order)) + j; MB_CHECK_ASSERT( - !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); + !mb_test_bit(k, e4b->bd_bitmap)); } count++; } @@ -894,37 +636,41 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); grp = ext4_get_group_info(sb, e4b->bd_group); - buddy = mb_find_buddy(e4b, 0, &max); list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; - pa = list_entry(cur, struct ext4_prealloc_space, group_list); - ext4_get_group_no_and_offset(sb, pa->pstart, &groupnr, &k); + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); MB_CHECK_ASSERT(groupnr == e4b->bd_group); - for (i = 0; i < pa->len; i++) + for (i = 0; i < pa->pa_len; i++) MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); } return 0; } #undef MB_CHECK_ASSERT #define 
mb_check_buddy(e4b) __mb_check_buddy(e4b, \ - __FILE__, __FUNCTION__, __LINE__) + __FILE__, __func__, __LINE__) #else #define mb_check_buddy(e4b) #endif -/* FIXME!! need more doc */ +/* + * Divide blocks started from @first with length @len into + * smaller chunks with power of 2 blocks. + * Clear the bits in bitmap which the blocks of the chunk(s) covered, + * then increase bb_counters[] for corresponded chunk size. + */ static void ext4_mb_mark_free_simple(struct super_block *sb, - void *buddy, unsigned first, int len, + void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned short min; - unsigned short max; - unsigned short chunk; + ext4_grpblk_t min; + ext4_grpblk_t max; + ext4_grpblk_t chunk; unsigned short border; - BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); + BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); border = 2 << sb->s_blocksize_bits; @@ -950,14 +696,37 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, } } -static void ext4_mb_generate_buddy(struct super_block *sb, +/* + * Cache the order of the largest free extent we have available in this block + * group. + */ +static void +mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) +{ + int i; + int bits; + + grp->bb_largest_free_order = -1; /* uninit */ + + bits = sb->s_blocksize_bits + 1; + for (i = bits; i >= 0; i--) { + if (grp->bb_counters[i] > 0) { + grp->bb_largest_free_order = i; + break; + } + } +} + +static noinline_for_stack +void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); - unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); - unsigned short i = 0; - unsigned short first; - unsigned short len; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_grpblk_t i = 0; + ext4_grpblk_t first; + ext4_grpblk_t len; unsigned free = 0; unsigned fragments = 0; unsigned long long period = get_cycles(); @@ -982,15 +751,21 @@ static void ext4_mb_generate_buddy(struct super_block *sb, grp->bb_fragments = fragments; if (free != grp->bb_free) { - ext4_error(sb, __FUNCTION__, - "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n", - group, free, grp->bb_free); + ext4_grp_locked_error(sb, group, 0, 0, + "block bitmap and bg descriptor " + "inconsistent: %u vs %u free clusters", + free, grp->bb_free); /* - * If we intent to continue, we consider group descritor + * If we intend to continue, we consider group descriptor * corrupt and update bb_free using bitmap value */ grp->bb_free = free; + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); } + mb_set_largest_free_order(sb, grp); clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); @@ -1001,6 +776,24 @@ static void ext4_mb_generate_buddy(struct super_block *sb, spin_unlock(&EXT4_SB(sb)->s_bal_lock); } +static void mb_regenerate_buddy(struct ext4_buddy *e4b) +{ + int count; + int order = 1; + void *buddy; + + while ((buddy = mb_find_buddy(e4b, order++, &count))) { + ext4_set_bits(buddy, 0, count); + } + e4b->bd_info->bb_fragments = 0; + memset(e4b->bd_info->bb_counters, 0, + sizeof(*e4b->bd_info->bb_counters) * + (e4b->bd_sb->s_blocksize_bits + 2)); + + ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, + e4b->bd_bitmap, e4b->bd_group); +} + /* The buddy information is attached the buddy cache inode * 
for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. The information involve @@ -1008,7 +801,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, * stored in the inode as * * { page } - * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. @@ -1016,28 +809,34 @@ static void ext4_mb_generate_buddy(struct super_block *sb, * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. * So it can have information regarding groups_per_page which * is blocks_per_page/2 + * + * Locking note: This routine takes the block group lock of all groups + * for this page; do not hold this lock when calling this routine! */ static int ext4_mb_init_cache(struct page *page, char *incore) { + ext4_group_t ngroups; int blocksize; int blocks_per_page; int groups_per_page; int err = 0; int i; - ext4_group_t first_group; + ext4_group_t first_group, group; int first_block; struct super_block *sb; struct buffer_head *bhs; - struct buffer_head **bh; + struct buffer_head **bh = NULL; struct inode *inode; char *data; char *bitmap; + struct ext4_group_info *grinfo; - mb_debug("init page %lu\n", page->index); + mb_debug(1, "init page %lu\n", page->index); inode = page->mapping->host; sb = inode->i_sb; + ngroups = ext4_get_groups_count(sb); blocksize = 1 << inode->i_blkbits; blocks_per_page = PAGE_CACHE_SIZE / blocksize; @@ -1047,67 +846,58 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* allocate buffer_heads to read bitmaps */ if (groups_per_page > 1) { - err = -ENOMEM; i = sizeof(struct buffer_head *) * groups_per_page; bh = kzalloc(i, GFP_NOFS); - if (bh == NULL) + if (bh == NULL) { + err = -ENOMEM; goto out; + } } else bh = &bhs; first_group = page->index * blocks_per_page / 2; /* read all groups the page covers into the cache */ - for (i = 0; i < groups_per_page; i++) { - struct ext4_group_desc *desc; - - if (first_group + i >= EXT4_SB(sb)->s_groups_count) + for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + if (group >= ngroups) break; - err = -EIO; - desc = ext4_get_group_desc(sb, first_group + i, NULL); - if (desc == NULL) - goto out; - - err = -ENOMEM; - bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc)); - if (bh[i] == NULL) - goto out; - - if (bh_uptodate_or_lock(bh[i])) - continue; - - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - ext4_init_block_bitmap(sb, bh[i], - first_group + i, desc); - set_buffer_uptodate(bh[i]); - unlock_buffer(bh[i]); + grinfo = ext4_get_group_info(sb, group); + /* + * If page is uptodate then we came here after online resize + * which added some new uninitialized group info structs, so + * we must skip all initialized uptodate buddies on the page, + * which may be currently in use by an allocating task. 
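+ *
+ * For orientation, a worked example of the bitmap/buddy page
+ * layout handled by this loop (a sketch, assuming 4k pages): with
+ * 4k blocks, blocks_per_page == 1, so page index 7 holds a single
+ * block, group (7 + 0) >> 1 == 3, and (first_block + i) & 1 == 1
+ * marks it as group 3's buddy; with 1k blocks, blocks_per_page ==
+ * 4, so page index 1 covers blocks 4..7, i.e. bitmap and buddy
+ * for group 2 followed by bitmap and buddy for group 3.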
+ */ + if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { + bh[i] = NULL; continue; } - get_bh(bh[i]); - bh[i]->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh[i]); - mb_debug("read bitmap for group %lu\n", first_group + i); + if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) { + err = -ENOMEM; + goto out; + } + mb_debug(1, "read bitmap for group %u\n", group); } /* wait for I/O completion */ - for (i = 0; i < groups_per_page && bh[i]; i++) - wait_on_buffer(bh[i]); - - err = -EIO; - for (i = 0; i < groups_per_page && bh[i]; i++) - if (!buffer_uptodate(bh[i])) + for (i = 0, group = first_group; i < groups_per_page; i++, group++) { + if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) { + err = -EIO; goto out; + } + } first_block = page->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { - int group; - struct ext4_group_info *grinfo; - group = (first_block + i) >> 1; - if (group >= EXT4_SB(sb)->s_groups_count) + if (group >= ngroups) break; + if (!bh[group - first_group]) + /* skip initialized uptodate buddy */ + continue; + /* * data carry information regarding this * particular group in the format specified @@ -1124,23 +914,29 @@ static int ext4_mb_init_cache(struct page *page, char *incore) if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); - mb_debug("put buddy for group %u in page %lu/%x\n", + mb_debug(1, "put buddy for group %u in page %lu/%x\n", group, page->index, i * blocksize); - memset(data, 0xff, blocksize); + trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo = ext4_get_group_info(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, - sizeof(unsigned short)*(sb->s_blocksize_bits+2)); + sizeof(*grinfo->bb_counters) * + (sb->s_blocksize_bits+2)); /* * incore got set to the group block bitmap below */ + ext4_lock_group(sb, group); + /* init the buddy */ + memset(data, 0xff, blocksize); ext4_mb_generate_buddy(sb, data, incore, group); + ext4_unlock_group(sb, group); incore = NULL; } else { /* this is block of bitmap */ BUG_ON(incore != NULL); - mb_debug("put bitmap for group %u in page %lu/%x\n", + mb_debug(1, "put bitmap for group %u in page %lu/%x\n", group, page->index, i * blocksize); + trace_ext4_mb_bitmap_load(sb, group); /* see comments in ext4_mb_put_pa() */ ext4_lock_group(sb, group); @@ -1148,6 +944,7 @@ /* mark all preallocated blks used in in-core bitmap */ ext4_mb_generate_from_pa(sb, data, group); + ext4_mb_generate_from_freelist(sb, data, group); ext4_unlock_group(sb, group); /* set incore so that the buddy information can be @@ -1160,7 +957,7 @@ out: if (bh) { - for (i = 0; i < groups_per_page && bh[i]; i++) + for (i = 0; i < groups_per_page; i++) brelse(bh[i]); if (bh != &bhs) kfree(bh); @@ -1168,28 +965,175 @@ out: return err; } -static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, - struct ext4_buddy *e4b) +/* + * Lock the buddy and bitmap pages. This makes sure other parallel init_group + * on the same buddy page doesn't happen while holding the buddy page lock. + * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap + * are on the same page e4b->bd_buddy_page is NULL and return value is 0. 
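+ *
+ * A condensed sketch of the intended call pattern, taken from
+ * ext4_mb_init_group() below (error handling elided):
+ *
+ *	ext4_mb_get_buddy_page_lock(sb, group, &e4b);
+ *	ext4_mb_init_cache(e4b.bd_bitmap_page, NULL);
+ *	if (e4b.bd_buddy_page)
+ *		ext4_mb_init_cache(e4b.bd_buddy_page, e4b.bd_bitmap);
+ *	ext4_mb_put_buddy_page_lock(&e4b);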
+ */ +static int ext4_mb_get_buddy_page_lock(struct super_block *sb, + ext4_group_t group, struct ext4_buddy *e4b) +{ + struct inode *inode = EXT4_SB(sb)->s_buddy_cache; + int block, pnum, poff; + int blocks_per_page; + struct page *page; + + e4b->bd_buddy_page = NULL; + e4b->bd_bitmap_page = NULL; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (!page) + return -ENOMEM; + BUG_ON(page->mapping != inode->i_mapping); + e4b->bd_bitmap_page = page; + e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + + if (blocks_per_page >= 2) { + /* buddy and bitmap are on the same page */ + return 0; + } + + block++; + pnum = block / blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (!page) + return -ENOMEM; + BUG_ON(page->mapping != inode->i_mapping); + e4b->bd_buddy_page = page; + return 0; +} + +static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) +{ + if (e4b->bd_bitmap_page) { + unlock_page(e4b->bd_bitmap_page); + page_cache_release(e4b->bd_bitmap_page); + } + if (e4b->bd_buddy_page) { + unlock_page(e4b->bd_buddy_page); + page_cache_release(e4b->bd_buddy_page); + } +} + +/* + * Locking note: This routine calls ext4_mb_init_cache(), which takes the + * block group lock of all groups for this page; do not hold the BG lock when + * calling this routine! + */ +static noinline_for_stack +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) +{ + + struct ext4_group_info *this_grp; + struct ext4_buddy e4b; + struct page *page; + int ret = 0; + + might_sleep(); + mb_debug(1, "init group %u\n", group); + this_grp = ext4_get_group_info(sb, group); + /* + * This ensures that we don't reinit the buddy cache + * page which map to the group from which we are already + * allocating. If we are looking at the buddy cache we would + * have taken a reference using ext4_mb_load_buddy and that + * would have pinned buddy page to page cache. + * The call to ext4_mb_get_buddy_page_lock will mark the + * page accessed. + */ + ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); + if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { + /* + * somebody initialized the group + * return without doing anything + */ + goto err; + } + + page = e4b.bd_bitmap_page; + ret = ext4_mb_init_cache(page, NULL); + if (ret) + goto err; + if (!PageUptodate(page)) { + ret = -EIO; + goto err; + } + + if (e4b.bd_buddy_page == NULL) { + /* + * If both the bitmap and buddy are in + * the same page we don't need to force + * init the buddy + */ + ret = 0; + goto err; + } + /* init buddy cache */ + page = e4b.bd_buddy_page; + ret = ext4_mb_init_cache(page, e4b.bd_bitmap); + if (ret) + goto err; + if (!PageUptodate(page)) { + ret = -EIO; + goto err; + } +err: + ext4_mb_put_buddy_page_lock(&e4b); + return ret; +} + +/* + * Locking note: This routine calls ext4_mb_init_cache(), which takes the + * block group lock of all groups for this page; do not hold the BG lock when + * calling this routine! 
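+ *
+ * Typical usage, condensed from the callers in this patch (e.g.
+ * ext4_mb_try_best_found()); a sketch only, error paths elided:
+ *
+ *	struct ext4_buddy e4b;
+ *
+ *	if (ext4_mb_load_buddy(sb, group, &e4b) == 0) {
+ *		ext4_lock_group(sb, group);
+ *		... scan e4b.bd_bitmap and e4b.bd_buddy ...
+ *		ext4_unlock_group(sb, group);
+ *		ext4_mb_unload_buddy(&e4b);
+ *	}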
+ */ +static noinline_for_stack int +ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b) { - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct inode *inode = sbi->s_buddy_cache; int blocks_per_page; int block; int pnum; int poff; struct page *page; + int ret; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; - mb_debug("load group %lu\n", group); + might_sleep(); + mb_debug(1, "load group %u\n", group); blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + grp = ext4_get_group_info(sb, group); e4b->bd_blkbits = sb->s_blocksize_bits; - e4b->bd_info = ext4_get_group_info(sb, group); + e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; e4b->bd_buddy_page = NULL; e4b->bd_bitmap_page = NULL; + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + /* + * we need full data about the group + * to make a good selection + */ + ret = ext4_mb_init_group(sb, group); + if (ret) + return ret; + } + /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. @@ -1201,49 +1145,79 @@ static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, /* we could use find_or_create_page(), but it locks page * what we'd like to avoid in fast path ... */ - page = find_get_page(inode->i_mapping, pnum); + page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); if (page == NULL || !PageUptodate(page)) { if (page) + /* + * drop the page reference and try + * to get the page with lock. If we + * are not uptodate that implies + * somebody just created the page but + * is yet to initialize the same. So + * wait for it to initialize. + */ page_cache_release(page); page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (page) { BUG_ON(page->mapping != inode->i_mapping); if (!PageUptodate(page)) { - ext4_mb_init_cache(page, NULL); + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } mb_cmp_bitmaps(e4b, page_address(page) + (poff * sb->s_blocksize)); } unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) + if (page == NULL) { + ret = -ENOMEM; goto err; + } + if (!PageUptodate(page)) { + ret = -EIO; + goto err; + } + + /* Pages marked accessed already */ e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); - mark_page_accessed(page); block++; pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_get_page(inode->i_mapping, pnum); + page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); if (page == NULL || !PageUptodate(page)) { if (page) page_cache_release(page); page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (page) { BUG_ON(page->mapping != inode->i_mapping); - if (!PageUptodate(page)) - ext4_mb_init_cache(page, e4b->bd_bitmap); - + if (!PageUptodate(page)) { + ret = ext4_mb_init_cache(page, e4b->bd_bitmap); + if (ret) { + unlock_page(page); + goto err; + } + } unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) + if (page == NULL) { + ret = -ENOMEM; goto err; + } + if (!PageUptodate(page)) { + ret = -EIO; + goto err; + } + + /* Pages marked accessed already */ e4b->bd_buddy_page = page; e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); - mark_page_accessed(page); BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); @@ -1251,16 +1225,18 @@ static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, return 0; err: + if (page) + page_cache_release(page); if 
(e4b->bd_bitmap_page) page_cache_release(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) page_cache_release(e4b->bd_buddy_page); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; - return -EIO; + return ret; } -static void ext4_mb_release_desc(struct ext4_buddy *e4b) +static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_page) page_cache_release(e4b->bd_bitmap_page); @@ -1274,10 +1250,10 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) int order = 1; void *bb; - BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); + BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); - bb = EXT4_MB_BUDDY(e4b); + bb = e4b->bd_buddy; while (order <= e4b->bd_blkbits + 1) { block = block >> 1; if (!mb_test_bit(block, bb)) { @@ -1290,7 +1266,7 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) return 0; } -static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) +static void mb_clear_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1303,12 +1279,39 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) cur += 32; continue; } - mb_clear_bit_atomic(lock, cur, bm); + mb_clear_bit(cur, bm); cur++; } } -static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) +/* clear bits in given range + * will return first found zero bit if any, -1 otherwise + */ +static int mb_test_and_clear_bits(void *bm, int cur, int len) +{ + __u32 *addr; + int zero_bit = -1; + + len = cur + len; + while (cur < len) { + if ((cur & 31) == 0 && (len - cur) >= 32) { + /* fast path: clear whole word at once */ + addr = bm + (cur >> 3); + if (*addr != (__u32)(-1) && zero_bit == -1) + zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); + *addr = 0; + cur += 32; + continue; + } + if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) + zero_bit = cur; + cur++; + } + + return zero_bit; +} + +void ext4_set_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1321,23 +1324,100 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) cur += 32; continue; } - mb_set_bit_atomic(lock, cur, bm); + mb_set_bit(cur, bm); cur++; } } -static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, - int first, int count) +/* + * _________________________________________________________________ */ + +static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) { - int block = 0; - int max = 0; - int order; - void *buddy; - void *buddy2; + if (mb_test_bit(*bit + side, bitmap)) { + mb_clear_bit(*bit, bitmap); + (*bit) -= side; + return 1; + } + else { + (*bit) += side; + mb_set_bit(*bit, bitmap); + return -1; + } +} + +static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) +{ + int max; + int order = 1; + void *buddy = mb_find_buddy(e4b, order, &max); + + while (buddy) { + void *buddy2; + + /* Bits in range [first; last] are known to be set since + * corresponding blocks were allocated. Bits in range + * (first; last) will stay set because they form buddies on + * upper layer. We just deal with borders if they don't + * align with upper layer and then go up. + * Releasing entire group is all about clearing + * single bit of highest order buddy. + */ + + /* Example: + * --------------------------------- + * | 1 | 1 | 1 | 1 | + * --------------------------------- + * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | + * --------------------------------- + * 0 1 2 3 4 5 6 7 + * \_____________________/ + * + * Neither [1] nor [6] is aligned to above layer. 
+ * Left neighbour [0] is free, so mark it busy, + * decrease bb_counters and extend range to + * [0; 6] + * Right neighbour [7] is busy. It can't be coalesced with [6], so + * mark [6] free, increase bb_counters and shrink range to + * [0; 5]. + * Then shift range to [0; 2], go up and do the same. + */ + + + if (first & 1) + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); + if (!(last & 1)) + e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); + if (first > last) + break; + order++; + + if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) { + mb_clear_bits(buddy, first, last - first + 1); + e4b->bd_info->bb_counters[order - 1] += last - first + 1; + break; + } + first >>= 1; + last >>= 1; + buddy = buddy2; + } +} + +static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, + int first, int count) +{ + int left_is_free = 0; + int right_is_free = 0; + int block; + int last = first + count - 1; struct super_block *sb = e4b->bd_sb; - BUG_ON(first + count > (sb->s_blocksize << 3)); - BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); + BUG_ON(last >= (sb->s_blocksize << 3)); + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + /* Don't bother if the block group is corrupt. */ + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) + return; + mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); @@ -1345,85 +1425,77 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; - /* let's maintain fragments counter */ + /* access memory sequentially: check left neighbour, + * clear range and then check right neighbour + */ if (first != 0) - block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); - if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) - max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); - if (block && max) + left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); + block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); + if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) + right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); + + if (unlikely(block != -1)) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t blocknr; + + blocknr = ext4_group_first_block_no(sb, e4b->bd_group); + blocknr += EXT4_C2B(EXT4_SB(sb), block); + ext4_grp_locked_error(sb, e4b->bd_group, + inode ? 
inode->i_ino : 0, + blocknr, + "freeing already freed block " + "(bit %u); block bitmap corrupt.", + block); + if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)) + percpu_counter_sub(&sbi->s_freeclusters_counter, + e4b->bd_info->bb_free); + /* Mark the block group as corrupt. */ + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, + &e4b->bd_info->bb_state); + mb_regenerate_buddy(e4b); + goto done; + } + + /* let's maintain fragments counter */ + if (left_is_free && right_is_free) e4b->bd_info->bb_fragments--; - else if (!block && !max) + else if (!left_is_free && !right_is_free) e4b->bd_info->bb_fragments++; - /* let's maintain buddy itself */ - while (count-- > 0) { - block = first++; - order = 0; - - if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { - ext4_fsblk_t blocknr; - blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); - blocknr += block; - blocknr += - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - - ext4_error(sb, __FUNCTION__, "double-free of inode" - " %lu's block %llu(bit %u in group %lu)\n", - inode ? inode->i_ino : 0, blocknr, block, - e4b->bd_group); - } - mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); - e4b->bd_info->bb_counters[order]++; - - /* start of the buddy */ - buddy = mb_find_buddy(e4b, order, &max); - - do { - block &= ~1UL; - if (mb_test_bit(block, buddy) || - mb_test_bit(block + 1, buddy)) - break; - - /* both the buddies are free, try to coalesce them */ - buddy2 = mb_find_buddy(e4b, order + 1, &max); - - if (!buddy2) - break; - - if (order > 0) { - /* for special purposes, we don't set - * free bits in bitmap */ - mb_set_bit(block, buddy); - mb_set_bit(block + 1, buddy); - } - e4b->bd_info->bb_counters[order]--; - e4b->bd_info->bb_counters[order]--; + /* buddy[0] == bd_bitmap is a special case, so handle + * it right away and let mb_buddy_mark_free stay free of + * zero order checks. + * Check if neighbours are to be coalesced, + * adjust bitmap bb_counters and borders appropriately. + */ + if (first & 1) { + first += !left_is_free; + e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; + } - block = block >> 1; - order++; - e4b->bd_info->bb_counters[order]++; + if (!(last & 1)) { + last -= !right_is_free; + e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1; + } - mb_clear_bit(block, buddy2); - buddy = buddy2; - } while (1); - } + if (first <= last) + mb_buddy_mark_free(e4b, first >> 1, last >> 1); +done: + mb_set_largest_free_order(sb, e4b->bd_info); mb_check_buddy(e4b); - - return 0; } -static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, +static int mb_find_extent(struct ext4_buddy *e4b, int block, int needed, struct ext4_free_extent *ex) { int next = block; - int max; - int ord; + int max, order; void *buddy; - BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); BUG_ON(ex == NULL); - buddy = mb_find_buddy(e4b, order, &max); + buddy = mb_find_buddy(e4b, 0, &max); BUG_ON(buddy == NULL); BUG_ON(block >= max); if (mb_test_bit(block, buddy)) { @@ -1433,12 +1505,9 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, return 0; } - /* FIXME dorp order completely ? 
*/ - if (likely(order == 0)) { - /* find actual order */ - order = mb_find_order_for_block(e4b, block); - block = block >> order; - } + /* find actual order */ + order = mb_find_order_for_block(e4b, block); + block = block >> order; ex->fe_len = 1 << order; ex->fe_start = block << order; @@ -1450,18 +1519,17 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, ex->fe_start += next; while (needed > ex->fe_len && - (buddy = mb_find_buddy(e4b, order, &max))) { + mb_find_buddy(e4b, order, &max)) { if (block + 1 >= max) break; next = (block + 1) * (1 << order); - if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) + if (mb_test_bit(next, e4b->bd_bitmap)) break; - ord = mb_find_order_for_block(e4b, next); + order = mb_find_order_for_block(e4b, next); - order = ord; block = next >> order; ex->fe_len += 1 << order; } @@ -1484,7 +1552,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); - BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); mb_check_buddy(e4b); mb_mark_used_double(e4b, start, len); @@ -1494,9 +1562,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) /* let's maintain fragments counter */ if (start != 0) - mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); + mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) - max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); + max = !mb_test_bit(start + len, e4b->bd_bitmap); if (mlen && max) e4b->bd_info->bb_fragments++; else if (!mlen && !max) @@ -1537,9 +1605,9 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) e4b->bd_info->bb_counters[ord]++; e4b->bd_info->bb_counters[ord]++; } + mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); - mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group), - EXT4_MB_BITMAP(e4b), ex->fe_start, len0); + ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); mb_check_buddy(e4b); return ret; @@ -1569,15 +1637,19 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, ac->ac_tail = ret & 0xffff; ac->ac_buddy = ret >> 16; - /* XXXXXXX: SUCH A HORRIBLE **CK */ - /*FIXME!! Why ? */ + /* + * take the page reference. We want the page to be pinned + * so that we don't get a ext4_mb_init_cache_call for this + * group until we update the bitmap. That would mean we + * double allocate blocks. 
The reference is dropped + * in ext4_mb_release_context + */ ac->ac_bitmap_page = e4b->bd_bitmap_page; get_page(ac->ac_bitmap_page); ac->ac_buddy_page = e4b->bd_buddy_page; get_page(ac->ac_buddy_page); - /* store last allocated for subsequent stream allocation */ - if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { spin_lock(&sbi->s_md_lock); sbi->s_mb_last_group = ac->ac_f_ex.fe_group; sbi->s_mb_last_start = ac->ac_f_ex.fe_start; @@ -1599,6 +1671,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac, struct ext4_free_extent ex; int max; + if (ac->ac_status == AC_STATUS_FOUND) + return; /* * We don't want to scan for a whole year */ @@ -1619,7 +1693,7 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac, /* recheck chunk's availability - we don't know * when it was found (within this lock-unlock * period or not) */ - max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex); + max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex); if (max >= gex->fe_len) { ext4_mb_use_best_found(ac, e4b); return; @@ -1645,8 +1719,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, struct ext4_free_extent *gex = &ac->ac_g_ex; BUG_ON(ex->fe_len <= 0); - BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); - BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); ac->ac_found++; @@ -1696,7 +1770,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, ext4_mb_check_limits(ac, e4b, 0); } -static int ext4_mb_try_best_found(struct ext4_allocation_context *ac, +static noinline_for_stack +int ext4_mb_try_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct ext4_free_extent ex = ac->ac_b_ex; @@ -1710,7 +1785,7 @@ static int ext4_mb_try_best_found(struct ext4_allocation_context *ac, return err; ext4_lock_group(ac->ac_sb, group); - max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); + max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); if (max > 0) { ac->ac_b_ex = ex; @@ -1718,37 +1793,46 @@ static int ext4_mb_try_best_found(struct ext4_allocation_context *ac, } ext4_unlock_group(ac->ac_sb, group); - ext4_mb_release_desc(e4b); + ext4_mb_unload_buddy(e4b); return 0; } -static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, +static noinline_for_stack +int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { ext4_group_t group = ac->ac_g_ex.fe_group; int max; int err; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_super_block *es = sbi->s_es; + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct ext4_free_extent ex; if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) return 0; + if (grp->bb_free == 0) + return 0; err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) return err; + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) { + ext4_mb_unload_buddy(e4b); + return 0; + } + ext4_lock_group(ac->ac_sb, group); - max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start, + max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); + ex.fe_logical = 0xDEADFA11; /* debug value */ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ext4_fsblk_t start; - start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + - ex.fe_start + le32_to_cpu(es->s_first_data_block); + start = 
ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + + ex.fe_start; /* use do_div to get remainder (would be 64-bit modulo) */ if (do_div(start, sbi->s_stripe) == 0) { ac->ac_found++; @@ -1773,7 +1857,7 @@ static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, ext4_mb_use_best_found(ac, e4b); } ext4_unlock_group(ac->ac_sb, group); - ext4_mb_release_desc(e4b); + ext4_mb_unload_buddy(e4b); return 0; } @@ -1782,7 +1866,8 @@ static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, * The routine scans buddy structures (not bitmap!) from given order * to max order and tries to find big enough chunk to satisfy the req */ -static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, +static noinline_for_stack +void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; @@ -1825,11 +1910,12 @@ static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, * In order to optimize scanning, caller must pass number of * free blocks in the group, so the routine can know upper limit. */ -static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, +static noinline_for_stack +void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; - void *bitmap = EXT4_MB_BITMAP(e4b); + void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; int i; int free; @@ -1841,24 +1927,26 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, while (free && ac->ac_status == AC_STATUS_CONTINUE) { i = mb_find_next_zero_bit(bitmap, - EXT4_BLOCKS_PER_GROUP(sb), i); - if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { + EXT4_CLUSTERS_PER_GROUP(sb), i); + if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { /* * IF we have corrupt bitmap, we won't find any * free blocks even though group info says we * we have free blocks */ - ext4_error(sb, __FUNCTION__, "%d free blocks as per " - "group info. But bitmap says 0\n", + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free clusters as per " + "group info. But bitmap says 0", free); break; } - mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); + mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); BUG_ON(ex.fe_len <= 0); if (free < ex.fe_len) { - ext4_error(sb, __FUNCTION__, "%d free blocks as per " - "group info. But got %d blocks\n", + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free clusters as per " + "group info. But got %d blocks", free, ex.fe_len); /* * The number of free blocks differs. 
This mostly @@ -1867,7 +1955,7 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, */ break; } - + ex.fe_logical = 0xDEADC0DE; /* debug value */ ext4_mb_measure_extent(ac, &ex, e4b); i += ex.fe_len; @@ -1879,15 +1967,15 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, /* * This is a special case for storages like raid5 - * we try to find stripe-aligned chunks for stripe-size requests - * XXX should do so at least for multiples of stripe size as well + * we try to find stripe-aligned chunks for stripe-size-multiple requests */ -static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, +static noinline_for_stack +void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - void *bitmap = EXT4_MB_BITMAP(e4b); + void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; ext4_fsblk_t first_group_block; ext4_fsblk_t a; @@ -1897,17 +1985,18 @@ static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, BUG_ON(sbi->s_stripe == 0); /* find first stripe-aligned block in group */ - first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb) - + le32_to_cpu(sbi->s_es->s_first_data_block); + first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); + a = first_group_block + sbi->s_stripe - 1; do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; - while (i < EXT4_BLOCKS_PER_GROUP(sb)) { + while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { - max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); + max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); if (max >= sbi->s_stripe) { ac->ac_found++; + ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); break; @@ -1917,37 +2006,54 @@ static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, } } +/* This is now called BEFORE we load the buddy bitmap. 
*/ static int ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, int cr) { unsigned free, fragments; - unsigned i, bits; - struct ext4_group_desc *desc; + int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); BUG_ON(cr < 0 || cr >= 4); - BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); free = grp->bb_free; - fragments = grp->bb_fragments; if (free == 0) return 0; + if (cr <= 2 && free < ac->ac_g_ex.fe_len) + return 0; + + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + return 0; + + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + int ret = ext4_mb_init_group(ac->ac_sb, group); + if (ret) + return 0; + } + + fragments = grp->bb_fragments; if (fragments == 0) return 0; switch (cr) { case 0: BUG_ON(ac->ac_2order == 0); - /* If this group is uninitialized, skip it initially */ - desc = ext4_get_group_desc(ac->ac_sb, group, NULL); - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) + + /* Avoid using the first bg of a flexgroup for data files */ + if ((ac->ac_flags & EXT4_MB_HINT_DATA) && + (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && + ((group % flex_size) == 0)) return 0; - bits = ac->ac_sb->s_blocksize_bits + 1; - for (i = ac->ac_2order; i <= bits; i++) - if (grp->bb_counters[i] > 0) - return 1; - break; + if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) || + (free / fragments) >= ac->ac_g_ex.fe_len) + return 1; + + if (grp->bb_largest_free_order < ac->ac_2order) + return 0; + + return 1; case 1: if ((free / fragments) >= ac->ac_g_ex.fe_len) return 1; @@ -1965,20 +2071,23 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, return 0; }
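To summarize the pass logic ext4_mb_good_group() implements above, here is a condensed, hypothetical restatement of the four criteria levels (field names simplified; this is not the kernel's code):

#include <stdbool.h>

/* Illustrative only: one predicate per cr pass. cr 0 wants a whole
 * buddy of the goal order, cr 1 wants the average free fragment to
 * fit the request, cr 2 just wants enough free clusters, and the
 * last-resort cr 3 takes any group with free space. */
struct group_stats { unsigned free, fragments; int largest_free_order; };

static bool worth_scanning(struct group_stats g, unsigned goal_len,
                           int goal_order, int cr)
{
        if (g.free == 0 || (cr <= 2 && g.free < goal_len))
                return false;
        switch (cr) {
        case 0:
                return g.largest_free_order >= goal_order;
        case 1:
                return g.fragments && g.free / g.fragments >= goal_len;
        default:
                return true;    /* cr 2 and cr 3 */
        }
}

-static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) +static noinline_for_stack int +ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { - ext4_group_t group; - ext4_group_t i; + ext4_group_t ngroups, group, i; int cr; int err = 0; - int bsbits; struct ext4_sb_info *sbi; struct super_block *sb; struct ext4_buddy e4b; - loff_t size, isize; sb = ac->ac_sb; sbi = EXT4_SB(sb); + ngroups = ext4_get_groups_count(sb); + /* non-extent files are limited to low blocks/groups */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) + ngroups = sbi->s_blockfile_groups; + BUG_ON(ac->ac_status == AC_STATUS_FOUND); /* first, try the goal */ @@ -1999,7 +2108,7 @@ static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) /* * We search using buddy data only if the order of the request * is greater than or equal to the sbi_s_mb_order2_reqs - * You can tune it via /proc/fs/ext4/<partition>/order2_req + * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req */ if (i >= sbi->s_mb_order2_reqs) { /* @@ -2009,15 +2118,8 @@ static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) ac->ac_2order = i - 1; } - bsbits = ac->ac_sb->s_blocksize_bits; /* if stream allocation is enabled, use global goal */ - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; - isize = i_size_read(ac->ac_inode) >> bsbits; - if (size < isize) - size = isize; - - if (size < sbi->s_mb_stream_request && - (ac->ac_flags & EXT4_MB_HINT_DATA)) { + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { /* TBD: may be hot point */ spin_lock(&sbi->s_md_lock); ac->ac_g_ex.fe_group = sbi->s_mb_last_group; @@ -2025,9 +2127,6 @@ static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) spin_unlock(&sbi->s_md_lock); } - /* searching for the right group start from the goal 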
value specified */ - group = ac->ac_g_ex.fe_group; - /* Let's just scan groups to find more-less suitable blocks */ cr = ac->ac_2order ? 0 : 1; /* @@ -2037,37 +2136,22 @@ static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) repeat: for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { ac->ac_criteria = cr; - for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { - struct ext4_group_info *grp; - struct ext4_group_desc *desc; - - if (group == EXT4_SB(sb)->s_groups_count) - group = 0; - - /* quick check to skip empty groups */ - grp = ext4_get_group_info(ac->ac_sb, group); - if (grp->bb_free == 0) - continue; + /* + * searching for the right group start + * from the goal value specified + */ + group = ac->ac_g_ex.fe_group; + for (i = 0; i < ngroups; group++, i++) { + cond_resched(); /* - * if the group is already init we check whether it is - * a good group and if not we don't load the buddy + * Artificially restricted ngroups for non-extent + * files makes group > ngroups possible on first loop. */ - if (EXT4_MB_GRP_NEED_INIT(grp)) { - /* - * we need full data about the group - * to make a good selection - */ - err = ext4_mb_load_buddy(sb, group, &e4b); - if (err) - goto out; - ext4_mb_release_desc(&e4b); - } + if (group >= ngroups) + group = 0; - /* - * If the particular group doesn't satisfy our - * criteria we continue with the next group - */ + /* This now checks without needing the buddy page */ if (!ext4_mb_good_group(ac, group, cr)) continue; @@ -2076,27 +2160,28 @@ repeat: goto out; ext4_lock_group(sb, group); + + /* + * We need to check again after locking the + * block group + */ if (!ext4_mb_good_group(ac, group, cr)) { - /* someone did allocation from this group */ ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); continue; } ac->ac_groups_scanned++; - desc = ext4_get_group_desc(sb, group, NULL); - if (cr == 0 || (desc->bg_flags & - cpu_to_le16(EXT4_BG_BLOCK_UNINIT) && - ac->ac_2order != 0)) + if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2) ext4_mb_simple_scan_group(ac, &e4b); - else if (cr == 1 && - ac->ac_g_ex.fe_len == sbi->s_stripe) + else if (cr == 1 && sbi->s_stripe && + !(ac->ac_g_ex.fe_len % sbi->s_stripe)) ext4_mb_scan_aligned(ac, &e4b); else ext4_mb_complex_scan_group(ac, &e4b); ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); if (ac->ac_status != AC_STATUS_CONTINUE) break; @@ -2132,241 +2217,40 @@ out: return err; } -#ifdef EXT4_MB_HISTORY -struct ext4_mb_proc_session { - struct ext4_mb_history *history; - struct super_block *sb; - int start; - int max; -}; - -static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s, - struct ext4_mb_history *hs, - int first) -{ - if (hs == s->history + s->max) - hs = s->history; - if (!first && hs == s->history + s->start) - return NULL; - while (hs->orig.fe_len == 0) { - hs++; - if (hs == s->history + s->max) - hs = s->history; - if (hs == s->history + s->start) - return NULL; - } - return hs; -} - -static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos) -{ - struct ext4_mb_proc_session *s = seq->private; - struct ext4_mb_history *hs; - int l = *pos; - - if (l == 0) - return SEQ_START_TOKEN; - hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1); - if (!hs) - return NULL; - while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL); - return hs; -} - -static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v, - loff_t *pos) -{ - struct 
ext4_mb_proc_session *s = seq->private; - struct ext4_mb_history *hs = v; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ext4_mb_history_skip_empty(s, s->history + s->start, 1); - else - return ext4_mb_history_skip_empty(s, ++hs, 0); -} - -static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) -{ - char buf[25], buf2[25], buf3[25], *fmt; - struct ext4_mb_history *hs = v; - - if (v == SEQ_START_TOKEN) { - seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s " - "%-5s %-2s %-5s %-5s %-5s %-6s\n", - "pid", "inode", "original", "goal", "result", "found", - "grps", "cr", "flags", "merge", "tail", "broken"); - return 0; - } - - if (hs->op == EXT4_MB_HISTORY_ALLOC) { - fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " - "%-5u %-5s %-5u %-6u\n"; - sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, - hs->result.fe_start, hs->result.fe_len, - hs->result.fe_logical); - sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, - hs->orig.fe_start, hs->orig.fe_len, - hs->orig.fe_logical); - sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group, - hs->goal.fe_start, hs->goal.fe_len, - hs->goal.fe_logical); - seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, - hs->found, hs->groups, hs->cr, hs->flags, - hs->merged ? "M" : "", hs->tail, - hs->buddy ? 1 << hs->buddy : 0); - } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { - fmt = "%-5u %-8u %-23s %-23s %-23s\n"; - sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, - hs->result.fe_start, hs->result.fe_len, - hs->result.fe_logical); - sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, - hs->orig.fe_start, hs->orig.fe_len, - hs->orig.fe_logical); - seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); - } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { - sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, - hs->result.fe_start, hs->result.fe_len); - seq_printf(seq, "%-5u %-8u %-23s discard\n", - hs->pid, hs->ino, buf2); - } else if (hs->op == EXT4_MB_HISTORY_FREE) { - sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, - hs->result.fe_start, hs->result.fe_len); - seq_printf(seq, "%-5u %-8u %-23s free\n", - hs->pid, hs->ino, buf2); - } - return 0; -} - -static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v) -{ -} - -static struct seq_operations ext4_mb_seq_history_ops = { - .start = ext4_mb_seq_history_start, - .next = ext4_mb_seq_history_next, - .stop = ext4_mb_seq_history_stop, - .show = ext4_mb_seq_history_show, -}; - -static int ext4_mb_seq_history_open(struct inode *inode, struct file *file) -{ - struct super_block *sb = PDE(inode)->data; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_mb_proc_session *s; - int rc; - int size; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (s == NULL) - return -ENOMEM; - s->sb = sb; - size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max; - s->history = kmalloc(size, GFP_KERNEL); - if (s->history == NULL) { - kfree(s); - return -ENOMEM; - } - - spin_lock(&sbi->s_mb_history_lock); - memcpy(s->history, sbi->s_mb_history, size); - s->max = sbi->s_mb_history_max; - s->start = sbi->s_mb_history_cur % s->max; - spin_unlock(&sbi->s_mb_history_lock); - - rc = seq_open(file, &ext4_mb_seq_history_ops); - if (rc == 0) { - struct seq_file *m = (struct seq_file *)file->private_data; - m->private = s; - } else { - kfree(s->history); - kfree(s); - } - return rc; - -} - -static int ext4_mb_seq_history_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = (struct seq_file *)file->private_data; - struct ext4_mb_proc_session *s = seq->private; - kfree(s->history); - kfree(s); - return 
seq_release(inode, file); -} - -static ssize_t ext4_mb_seq_history_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *ppos) -{ - struct seq_file *seq = (struct seq_file *)file->private_data; - struct ext4_mb_proc_session *s = seq->private; - struct super_block *sb = s->sb; - char str[32]; - int value; - - if (count >= sizeof(str)) { - printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n", - "mb_history", (int)sizeof(str)); - return -EOVERFLOW; - } - - if (copy_from_user(str, buffer, count)) - return -EFAULT; - - value = simple_strtol(str, NULL, 0); - if (value < 0) - return -ERANGE; - EXT4_SB(sb)->s_mb_history_filter = value; - - return count; -} - -static struct file_operations ext4_mb_seq_history_fops = { - .owner = THIS_MODULE, - .open = ext4_mb_seq_history_open, - .read = seq_read, - .write = ext4_mb_seq_history_write, - .llseek = seq_lseek, - .release = ext4_mb_seq_history_release, -}; - static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = seq->private; - struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; - if (*pos < 0 || *pos >= sbi->s_groups_count) + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; - group = *pos + 1; - return (void *) group; + return (void *) ((unsigned long) group); } static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) { struct super_block *sb = seq->private; - struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ++*pos; - if (*pos < 0 || *pos >= sbi->s_groups_count) + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; group = *pos + 1; - return (void *) group;; + return (void *) ((unsigned long) group); } static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { struct super_block *sb = seq->private; - long group = (long) v; + ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i; - int err; + int err, buddy_loaded = 0; struct ext4_buddy e4b; + struct ext4_group_info *grinfo; struct sg { struct ext4_group_info info; - unsigned short counters[16]; + ext4_grpblk_t counters[16]; } sg; group--; @@ -2380,17 +2264,23 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + sizeof(struct ext4_group_info); - err = ext4_mb_load_buddy(sb, group, &e4b); - if (err) { - seq_printf(seq, "#%-5lu: I/O error\n", group); - return 0; + grinfo = ext4_get_group_info(sb, group); + /* Load the group info in memory only if not already loaded. */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { + seq_printf(seq, "#%-5u: I/O error\n", group); + return 0; + } + buddy_loaded = 1; } - ext4_lock_group(sb, group); + memcpy(&sg, ext4_get_group_info(sb, group), i); - ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); - seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, + if (buddy_loaded) + ext4_mb_unload_buddy(&e4b); + + seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, sg.info.bb_fragments, sg.info.bb_first_free); for (i = 0; i <= 13; i++) seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 
@@ -2404,7 +2294,7 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) { } -static struct seq_operations ext4_mb_seq_groups_ops = { +static const struct seq_operations ext4_mb_seq_groups_ops = { .start = ext4_mb_seq_groups_start, .next = ext4_mb_seq_groups_next, .stop = ext4_mb_seq_groups_stop, @@ -2413,19 +2303,19 @@ static struct seq_operations ext4_mb_seq_groups_ops = { static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) { - struct super_block *sb = PDE(inode)->data; + struct super_block *sb = PDE_DATA(inode); int rc; rc = seq_open(file, &ext4_mb_seq_groups_ops); if (rc == 0) { - struct seq_file *m = (struct seq_file *)file->private_data; + struct seq_file *m = file->private_data; m->private = sb; } return rc; } -static struct file_operations ext4_mb_seq_groups_fops = { +static const struct file_operations ext4_mb_seq_groups_fops = { .owner = THIS_MODULE, .open = ext4_mb_seq_groups_open, .read = seq_read, @@ -2433,229 +2323,257 @@ static struct file_operations ext4_mb_seq_groups_fops = { .release = seq_release, }; -static void ext4_mb_history_release(struct super_block *sb) +static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) +{ + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; + + BUG_ON(!cachep); + return cachep; +} + +/* + * Allocate the top-level s_group_info array for the specified number + * of groups + */ +int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned size; + struct ext4_group_info ***new_groupinfo; - remove_proc_entry("mb_groups", sbi->s_mb_proc); - remove_proc_entry("mb_history", sbi->s_mb_proc); + size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> + EXT4_DESC_PER_BLOCK_BITS(sb); + if (size <= sbi->s_group_info_size) + return 0; - kfree(sbi->s_mb_history); + size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); + new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL); + if (!new_groupinfo) { + ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); + return -ENOMEM; + } + if (sbi->s_group_info) { + memcpy(new_groupinfo, sbi->s_group_info, + sbi->s_group_info_size * sizeof(*sbi->s_group_info)); + ext4_kvfree(sbi->s_group_info); + } + sbi->s_group_info = new_groupinfo; + sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); + ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", + sbi->s_group_info_size); + return 0; } -static void ext4_mb_history_init(struct super_block *sb) +/* Create and initialize ext4_group_info data for the given group. */ +int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, + struct ext4_group_desc *desc) { - struct ext4_sb_info *sbi = EXT4_SB(sb); int i; + int metalen = 0; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info **meta_group_info; + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); - if (sbi->s_mb_proc != NULL) { - struct proc_dir_entry *p; - p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); - if (p) { - p->proc_fops = &ext4_mb_seq_history_fops; - p->data = sb; - } - p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); - if (p) { - p->proc_fops = &ext4_mb_seq_groups_fops; - p->data = sb; + /* + * First check if this group is the first of a reserved block. 
+ * If it's true, we have to allocate a new table of pointers + * to ext4_group_info structures + */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { + metalen = sizeof(*meta_group_info) << + EXT4_DESC_PER_BLOCK_BITS(sb); + meta_group_info = kmalloc(metalen, GFP_KERNEL); + if (meta_group_info == NULL) { + ext4_msg(sb, KERN_ERR, "can't allocate mem " + "for a buddy group"); + goto exit_meta_group_info; } + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = + meta_group_info; } - sbi->s_mb_history_max = 1000; - sbi->s_mb_history_cur = 0; - spin_lock_init(&sbi->s_mb_history_lock); - i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); - sbi->s_mb_history = kmalloc(i, GFP_KERNEL); - if (likely(sbi->s_mb_history != NULL)) - memset(sbi->s_mb_history, 0, i); - /* if we can't allocate history, then we simple won't use it */ -} + meta_group_info = + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; + i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); -static void ext4_mb_store_history(struct ext4_allocation_context *ac) -{ - struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_mb_history h; + meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL); + if (meta_group_info[i] == NULL) { + ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); + goto exit_group_info; + } + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, + &(meta_group_info[i]->bb_state)); - if (unlikely(sbi->s_mb_history == NULL)) - return; + /* + * initialize bb_free to be able to skip + * empty groups without initialization + */ + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { + meta_group_info[i]->bb_free = + ext4_free_clusters_after_init(sb, group, desc); + } else { + meta_group_info[i]->bb_free = + ext4_free_group_clusters(sb, desc); + } - if (!(ac->ac_op & sbi->s_mb_history_filter)) - return; + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + init_rwsem(&meta_group_info[i]->alloc_sem); + meta_group_info[i]->bb_free_root = RB_ROOT; + meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ - h.op = ac->ac_op; - h.pid = current->pid; - h.ino = ac->ac_inode ? 
ac->ac_inode->i_ino : 0; - h.orig = ac->ac_o_ex; - h.result = ac->ac_b_ex; - h.flags = ac->ac_flags; - h.found = ac->ac_found; - h.groups = ac->ac_groups_scanned; - h.cr = ac->ac_criteria; - h.tail = ac->ac_tail; - h.buddy = ac->ac_buddy; - h.merged = 0; - if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) { - if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && - ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) - h.merged = 1; - h.goal = ac->ac_g_ex; - h.result = ac->ac_f_ex; +#ifdef DOUBLE_CHECK + { + struct buffer_head *bh; + meta_group_info[i]->bb_bitmap = + kmalloc(sb->s_blocksize, GFP_KERNEL); + BUG_ON(meta_group_info[i]->bb_bitmap == NULL); + bh = ext4_read_block_bitmap(sb, group); + BUG_ON(bh == NULL); + memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, + sb->s_blocksize); + put_bh(bh); } +#endif - spin_lock(&sbi->s_mb_history_lock); - memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); - if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) - sbi->s_mb_history_cur = 0; - spin_unlock(&sbi->s_mb_history_lock); -} + return 0; -#else -#define ext4_mb_history_release(sb) -#define ext4_mb_history_init(sb) -#endif +exit_group_info: + /* If a meta_group_info table has been allocated, release it now */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { + kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL; + } +exit_meta_group_info: + return -ENOMEM; +} /* ext4_mb_add_groupinfo */
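The indexing in the error path above is the same two-level scheme the allocator uses everywhere: a group number splits into a meta-group index and an offset within that meta-group. A simplified sketch of the lookup (the real helper is ext4_get_group_info() in ext4.h; types trimmed here):

/* With EXT4_DESC_PER_BLOCK == 128 (desc_per_blk_bits == 7), group g
 * lives at table[g >> 7][g & 127]; the top-level array stays small
 * even for very large filesystems. */
struct group_info;

static struct group_info *lookup_group(struct group_info ***table,
                                       unsigned int group,
                                       unsigned int desc_per_blk_bits)
{
        unsigned int hi = group >> desc_per_blk_bits;
        unsigned int lo = group & ((1U << desc_per_blk_bits) - 1);
        return table[hi][lo];
}

static int ext4_mb_init_backend(struct super_block *sb) { + ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; - int j, len, metalen; struct ext4_sb_info *sbi = EXT4_SB(sb); - int num_meta_group_infos = - (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >> - EXT4_DESC_PER_BLOCK_BITS(sb); - struct ext4_group_info **meta_group_info; + int err; + struct ext4_group_desc *desc; + struct kmem_cache *cachep; + + err = ext4_mb_alloc_groupinfo(sb, ngroups); + if (err) + return err; - /* An 8TB filesystem with 64-bit pointers requires a 4096 byte - * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. - * So a two level scheme suffices for now. */ - sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * - num_meta_group_infos, GFP_KERNEL); - if (sbi->s_group_info == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); - return -ENOMEM; - } sbi->s_buddy_cache = new_inode(sb); if (sbi->s_buddy_cache == NULL) { - printk(KERN_ERR "EXT4-fs: can't get new inode\n"); + ext4_msg(sb, KERN_ERR, "can't get new inode"); goto err_freesgi; } + /* To avoid potentially colliding with a valid on-disk inode number, + * use EXT4_BAD_INO for the buddy cache inode number. This inode is + * not in the inode hash, so it should never be found by iget(), but + * this will avoid confusion if it ever shows up during debugging. */ + sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; - - metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); - for (i = 0; i < num_meta_group_infos; i++) { - if ((i + 1) == num_meta_group_infos) - metalen = sizeof(*meta_group_info) * - (sbi->s_groups_count - - (i << EXT4_DESC_PER_BLOCK_BITS(sb))); - meta_group_info = kmalloc(metalen, GFP_KERNEL); - if (meta_group_info == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate mem for a " - "buddy group\n"); - goto err_freemeta; - } - sbi->s_group_info[i] = meta_group_info; - } - - /* - * calculate needed size. 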
if change bb_counters size, - * don't forget about ext4_mb_generate_buddy() - */ - len = sizeof(struct ext4_group_info); - len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); - for (i = 0; i < sbi->s_groups_count; i++) { - struct ext4_group_desc *desc; - - meta_group_info = - sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)]; - j = i & (EXT4_DESC_PER_BLOCK(sb) - 1); - - meta_group_info[j] = kzalloc(len, GFP_KERNEL); - if (meta_group_info[j] == NULL) { - printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); - i--; - goto err_freebuddy; - } + for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { - printk(KERN_ERR - "EXT4-fs: can't read descriptor %lu\n", i); + ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); goto err_freebuddy; } - memset(meta_group_info[j], 0, len); - set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, - &(meta_group_info[j]->bb_state)); - - /* - * initialize bb_free to be able to skip - * empty groups without initialization - */ - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - meta_group_info[j]->bb_free = - ext4_free_blocks_after_init(sb, i, desc); - } else { - meta_group_info[j]->bb_free = - le16_to_cpu(desc->bg_free_blocks_count); - } - - INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list); - -#ifdef DOUBLE_CHECK - { - struct buffer_head *bh; - meta_group_info[j]->bb_bitmap = - kmalloc(sb->s_blocksize, GFP_KERNEL); - BUG_ON(meta_group_info[j]->bb_bitmap == NULL); - bh = read_block_bitmap(sb, i); - BUG_ON(bh == NULL); - memcpy(meta_group_info[j]->bb_bitmap, bh->b_data, - sb->s_blocksize); - put_bh(bh); - } -#endif - + if (ext4_mb_add_groupinfo(sb, i, desc) != 0) + goto err_freebuddy; } return 0; err_freebuddy: - while (i >= 0) { - kfree(ext4_get_group_info(sb, i)); - i--; - } - i = num_meta_group_infos; -err_freemeta: - while (--i >= 0) + cachep = get_groupinfo_cache(sb->s_blocksize_bits); + while (i-- > 0) + kmem_cache_free(cachep, ext4_get_group_info(sb, i)); + i = sbi->s_group_info_size; + while (i-- > 0) kfree(sbi->s_group_info[i]); iput(sbi->s_buddy_cache); err_freesgi: - kfree(sbi->s_group_info); + ext4_kvfree(sbi->s_group_info); return -ENOMEM; } -int ext4_mb_init(struct super_block *sb, int needs_recovery) +static void ext4_groupinfo_destroy_slabs(void) +{ + int i; + + for (i = 0; i < NR_GRPINFO_CACHES; i++) { + if (ext4_groupinfo_caches[i]) + kmem_cache_destroy(ext4_groupinfo_caches[i]); + ext4_groupinfo_caches[i] = NULL; + } +} + +static int ext4_groupinfo_create_slab(size_t size) +{ + static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); + int slab_size; + int blocksize_bits = order_base_2(size); + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; + struct kmem_cache *cachep; + + if (cache_index >= NR_GRPINFO_CACHES) + return -EINVAL; + + if (unlikely(cache_index < 0)) + cache_index = 0; + + mutex_lock(&ext4_grpinfo_slab_create_mutex); + if (ext4_groupinfo_caches[cache_index]) { + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + return 0; /* Already created */ + } + + slab_size = offsetof(struct ext4_group_info, + bb_counters[blocksize_bits + 2]); + + cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], + slab_size, 0, SLAB_RECLAIM_ACCOUNT, + NULL); + + ext4_groupinfo_caches[cache_index] = cachep; + + mutex_unlock(&ext4_grpinfo_slab_create_mutex); + if (!cachep) { + printk(KERN_EMERG + "EXT4-fs: no memory for groupinfo slab cache\n"); + return -ENOMEM; + } + + return 0; +} + +int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned i; + 
unsigned i, j; unsigned offset; unsigned max; + int ret; - if (!test_opt(sb, MBALLOC)) - return 0; - - i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); + i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_offsets == NULL) { - clear_opt(sbi->s_mount_opt, MBALLOC); - return -ENOMEM; + ret = -ENOMEM; + goto out; } + + i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { - clear_opt(sbi->s_mount_opt, MBALLOC); - kfree(sbi->s_mb_maxs); - return -ENOMEM; + ret = -ENOMEM; + goto out; } + ret = ext4_groupinfo_create_slab(sb->s_blocksize); + if (ret < 0) + goto out; + /* order 0 is regular bitmap */ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; sbi->s_mb_offsets[0] = 0; @@ -2671,19 +2589,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) i++; } while (i <= sb->s_blocksize_bits + 1); - /* init file for buddy data */ - i = ext4_mb_init_backend(sb); - if (i) { - clear_opt(sbi->s_mount_opt, MBALLOC); - kfree(sbi->s_mb_offsets); - kfree(sbi->s_mb_maxs); - return i; - } - spin_lock_init(&sbi->s_md_lock); - INIT_LIST_HEAD(&sbi->s_active_transaction); - INIT_LIST_HEAD(&sbi->s_closed_transaction); - INIT_LIST_HEAD(&sbi->s_committed_transaction); spin_lock_init(&sbi->s_bal_lock); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; @@ -2691,33 +2597,70 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; - sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; - sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; + /* + * The default group preallocation is 512, which for 4k block + * sizes translates to 2 megabytes. However for bigalloc file + * systems, this is probably too big (i.e., if the cluster size + * is 1 megabyte, then group preallocation size becomes half a + * gigabyte!). As a default, we will keep a two megabyte + * group prealloc size for cluster sizes up to 64k, and after + * that, we will force a minimum group preallocation size of + * 32 clusters. This translates to 8 megs when the cluster + * size is 256k, and 32 megs when the cluster size is 1 meg, + * which seems reasonable as a default. + */ + sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> + sbi->s_cluster_bits, 32); + /* + * If there is a s_stripe > 1, then we set the s_mb_group_prealloc + * to the lowest multiple of s_stripe which is bigger than + * the s_mb_group_prealloc as determined above. We want + * the preallocation size to be an exact multiple of the + * RAID stripe size so that preallocations don't fragment + * the stripes. 
*/ + if (sbi->s_stripe > 1) { + sbi->s_mb_group_prealloc = roundup( + sbi->s_mb_group_prealloc, sbi->s_stripe); + }
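The arithmetic described in the comment above is compact enough to check by hand; a standalone sketch (MB_DEFAULT_GROUP_PREALLOC is 512 clusters; the helper name is invented):

#include <stdio.h>

/* Reproduces s_mb_group_prealloc = max(512 >> s_cluster_bits, 32),
 * followed by the roundup to a stripe multiple. cluster_bits is
 * log2(blocks per cluster). */
static unsigned int group_prealloc(unsigned int cluster_bits,
                                   unsigned int stripe)
{
        unsigned int pa = 512 >> cluster_bits;

        if (pa < 32)
                pa = 32;
        if (stripe > 1)
                pa = ((pa + stripe - 1) / stripe) * stripe; /* roundup() */
        return pa;
}

int main(void)
{
        /* 4k blocks, 64k clusters (cluster_bits 4): 32 clusters = 2 MiB */
        printf("%u\n", group_prealloc(4, 0));
        /* 4k blocks, 1 MiB clusters (cluster_bits 8): the 32-cluster
         * floor applies, i.e. 32 MiB, matching the comment above */
        printf("%u\n", group_prealloc(8, 0));
        return 0;
}

- i = sizeof(struct ext4_locality_group) * NR_CPUS; - sbi->s_locality_groups = kmalloc(i, GFP_KERNEL); + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { - clear_opt(sbi->s_mount_opt, MBALLOC); - kfree(sbi->s_mb_offsets); - kfree(sbi->s_mb_maxs); - return -ENOMEM; + ret = -ENOMEM; + goto out; } - for (i = 0; i < NR_CPUS; i++) { + for_each_possible_cpu(i) { struct ext4_locality_group *lg; - lg = &sbi->s_locality_groups[i]; + lg = per_cpu_ptr(sbi->s_locality_groups, i); mutex_init(&lg->lg_mutex); - INIT_LIST_HEAD(&lg->lg_prealloc_list); + for (j = 0; j < PREALLOC_TB_SIZE; j++) + INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); spin_lock_init(&lg->lg_prealloc_lock); } - ext4_mb_init_per_dev_proc(sb); - ext4_mb_history_init(sb); + /* init file for buddy data */ + ret = ext4_mb_init_backend(sb); + if (ret != 0) + goto out_free_locality_groups; + + if (sbi->s_proc) + proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_fops, sb); - printk("EXT4-fs: mballoc enabled\n"); return 0; + +out_free_locality_groups: + free_percpu(sbi->s_locality_groups); + sbi->s_locality_groups = NULL; +out: + kfree(sbi->s_mb_offsets); + sbi->s_mb_offsets = NULL; + kfree(sbi->s_mb_maxs); + sbi->s_mb_maxs = NULL; + return ret; } -/* need to called with ext4 group lock (ext4_lock_group) */ +/* needs to be called with the ext4 group lock held */ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) { struct ext4_prealloc_space *pa; @@ -2728,34 +2671,27 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); list_del(&pa->pa_group_list); count++; - kfree(pa); + kmem_cache_free(ext4_pspace_cachep, pa); } if (count) - mb_debug("mballoc: %u PAs left\n", count); + mb_debug(1, "mballoc: %u PAs left\n", count); } int ext4_mb_release(struct super_block *sb) { + ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; int num_meta_group_infos; struct ext4_group_info *grinfo; struct ext4_sb_info *sbi = EXT4_SB(sb); + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); - if (!test_opt(sb, MBALLOC)) - return 0; - - /* release freed, non-committed blocks */ - spin_lock(&sbi->s_md_lock); - list_splice_init(&sbi->s_closed_transaction, - &sbi->s_committed_transaction); - list_splice_init(&sbi->s_active_transaction, - &sbi->s_committed_transaction); - spin_unlock(&sbi->s_md_lock); - ext4_mb_free_committed_blocks(sb); + if (sbi->s_proc) + remove_proc_entry("mb_groups", sbi->s_proc); if (sbi->s_group_info) { - for (i = 0; i < sbi->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { grinfo = ext4_get_group_info(sb, i); #ifdef DOUBLE_CHECK kfree(grinfo->bb_bitmap); @@ -2763,299 +2699,192 @@ int ext4_mb_release(struct super_block *sb) ext4_lock_group(sb, i); ext4_mb_cleanup_pa(grinfo); ext4_unlock_group(sb, i); - kfree(grinfo); + kmem_cache_free(cachep, grinfo); } - num_meta_group_infos = (sbi->s_groups_count + + num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); for (i = 0; i < num_meta_group_infos; i++) kfree(sbi->s_group_info[i]); - kfree(sbi->s_group_info); + ext4_kvfree(sbi->s_group_info); } kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); if (sbi->s_buddy_cache) iput(sbi->s_buddy_cache); if (sbi->s_mb_stats) { - printk(KERN_INFO - "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", + ext4_msg(sb, 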
KERN_INFO, + "mballoc: %u blocks %u reqs (%u success)", atomic_read(&sbi->s_bal_allocated), atomic_read(&sbi->s_bal_reqs), atomic_read(&sbi->s_bal_success)); - printk(KERN_INFO - "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " - "%u 2^N hits, %u breaks, %u lost\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %u extents scanned, %u goal hits, " + "%u 2^N hits, %u breaks, %u lost", atomic_read(&sbi->s_bal_ex_scanned), atomic_read(&sbi->s_bal_goals), atomic_read(&sbi->s_bal_2orders), atomic_read(&sbi->s_bal_breaks), atomic_read(&sbi->s_mb_lost_chunks)); - printk(KERN_INFO - "EXT4-fs: mballoc: %lu generated and it took %Lu\n", - sbi->s_mb_buddies_generated++, + ext4_msg(sb, KERN_INFO, + "mballoc: %lu generated and it took %Lu", + sbi->s_mb_buddies_generated, sbi->s_mb_generation_time); - printk(KERN_INFO - "EXT4-fs: mballoc: %u preallocated, %u discarded\n", + ext4_msg(sb, KERN_INFO, + "mballoc: %u preallocated, %u discarded", atomic_read(&sbi->s_mb_preallocated), atomic_read(&sbi->s_mb_discarded)); } - kfree(sbi->s_locality_groups); - - ext4_mb_history_release(sb); - ext4_mb_destroy_per_dev_proc(sb); + free_percpu(sbi->s_locality_groups); return 0; } -static void ext4_mb_free_committed_blocks(struct super_block *sb) +static inline int ext4_issue_discard(struct super_block *sb, + ext4_group_t block_group, ext4_grpblk_t cluster, int count) { - struct ext4_sb_info *sbi = EXT4_SB(sb); - int err; - int i; - int count = 0; - int count2 = 0; - struct ext4_free_metadata *md; - struct ext4_buddy e4b; - - if (list_empty(&sbi->s_committed_transaction)) - return; - - /* there is committed blocks to be freed yet */ - do { - /* get next array of blocks */ - md = NULL; - spin_lock(&sbi->s_md_lock); - if (!list_empty(&sbi->s_committed_transaction)) { - md = list_entry(sbi->s_committed_transaction.next, - struct ext4_free_metadata, list); - list_del(&md->list); - } - spin_unlock(&sbi->s_md_lock); - - if (md == NULL) - break; - - mb_debug("gonna free %u blocks in group %lu (0x%p):", - md->num, md->group, md); - - err = ext4_mb_load_buddy(sb, md->group, &e4b); - /* we expect to find existing buddy because it's pinned */ - BUG_ON(err != 0); - - /* there are blocks to put in buddy to make them really free */ - count += md->num; - count2++; - ext4_lock_group(sb, md->group); - for (i = 0; i < md->num; i++) { - mb_debug(" %u", md->blocks[i]); - err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1); - BUG_ON(err != 0); - } - mb_debug("\n"); - ext4_unlock_group(sb, md->group); - - /* balance refcounts from ext4_mb_free_metadata() */ - page_cache_release(e4b.bd_buddy_page); - page_cache_release(e4b.bd_bitmap_page); - - kfree(md); - ext4_mb_release_desc(&e4b); - - } while (md); - - mb_debug("freed %u blocks in %u structures\n", count, count2); -} - -#define EXT4_ROOT "ext4" -#define EXT4_MB_STATS_NAME "stats" -#define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan" -#define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan" -#define EXT4_MB_ORDER2_REQ "order2_req" -#define EXT4_MB_STREAM_REQ "stream_req" -#define EXT4_MB_GROUP_PREALLOC "group_prealloc" - - - -#define MB_PROC_VALUE_READ(name) \ -static int ext4_mb_read_##name(char *page, char **start, \ - off_t off, int count, int *eof, void *data) \ -{ \ - struct ext4_sb_info *sbi = data; \ - int len; \ - *eof = 1; \ - if (off != 0) \ - return 0; \ - len = sprintf(page, "%ld\n", sbi->s_mb_##name); \ - *start = page; \ - return len; \ + ext4_fsblk_t discard_block; + + discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + + ext4_group_first_block_no(sb, block_group)); + count = 
EXT4_C2B(EXT4_SB(sb), count); + trace_ext4_discard_blocks(sb, + (unsigned long long) discard_block, count); + return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } -#define MB_PROC_VALUE_WRITE(name) \ -static int ext4_mb_write_##name(struct file *file, \ - const char __user *buf, unsigned long cnt, void *data) \ -{ \ - struct ext4_sb_info *sbi = data; \ - char str[32]; \ - long value; \ - if (cnt >= sizeof(str)) \ - return -EINVAL; \ - if (copy_from_user(str, buf, cnt)) \ - return -EFAULT; \ - value = simple_strtol(str, NULL, 0); \ - if (value <= 0) \ - return -ERANGE; \ - sbi->s_mb_##name = value; \ - return cnt; \ -} - -MB_PROC_VALUE_READ(stats); -MB_PROC_VALUE_WRITE(stats); -MB_PROC_VALUE_READ(max_to_scan); -MB_PROC_VALUE_WRITE(max_to_scan); -MB_PROC_VALUE_READ(min_to_scan); -MB_PROC_VALUE_WRITE(min_to_scan); -MB_PROC_VALUE_READ(order2_reqs); -MB_PROC_VALUE_WRITE(order2_reqs); -MB_PROC_VALUE_READ(stream_request); -MB_PROC_VALUE_WRITE(stream_request); -MB_PROC_VALUE_READ(group_prealloc); -MB_PROC_VALUE_WRITE(group_prealloc); - -#define MB_PROC_HANDLER(name, var) \ -do { \ - proc = create_proc_entry(name, mode, sbi->s_mb_proc); \ - if (proc == NULL) { \ - printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \ - goto err_out; \ - } \ - proc->data = sbi; \ - proc->read_proc = ext4_mb_read_##var ; \ - proc->write_proc = ext4_mb_write_##var; \ -} while (0) - -static int ext4_mb_init_per_dev_proc(struct super_block *sb) +/* + * This function is called by the jbd2 layer once the commit has finished, + * so we know we can free the blocks that were released with that commit. + */ +static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, + int rc) { - mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct proc_dir_entry *proc; - char devname[64]; + struct ext4_free_data *entry = (struct ext4_free_data *)jce; + struct ext4_buddy e4b; + struct ext4_group_info *db; + int err, count = 0, count2 = 0; + + mb_debug(1, "gonna free %u blocks in group %u (0x%p):", + entry->efd_count, entry->efd_group, entry); + + if (test_opt(sb, DISCARD)) { + err = ext4_issue_discard(sb, entry->efd_group, + entry->efd_start_cluster, + entry->efd_count); + if (err && err != -EOPNOTSUPP) + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%d block:%d count:%d failed" + " with %d", entry->efd_group, + entry->efd_start_cluster, + entry->efd_count, err); + } - snprintf(devname, sizeof(devname) - 1, "%s", - bdevname(sb->s_bdev, devname)); - sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4); + err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); + /* we expect to find existing buddy because it's pinned */ + BUG_ON(err != 0); - MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats); - MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan); - MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan); - MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs); - MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request); - MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc); - return 0; + db = e4b.bd_info; + /* there are blocks to put in buddy to make them really free */ + count += entry->efd_count; + count2++; + ext4_lock_group(sb, entry->efd_group); + /* Take it out of per group rb tree */ + rb_erase(&entry->efd_node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); -err_out: - printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname); - remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); - 
remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc); - remove_proc_entry(devname, proc_root_ext4); - sbi->s_mb_proc = NULL; - - return -ENOMEM; -} - -static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - char devname[64]; - - if (sbi->s_mb_proc == NULL) - return -EINVAL; + /* + * Clear the trimmed flag for the group so that the next + * ext4_trim_fs can trim it. + * If the volume is mounted with -o discard, online discard + * is supported and the free blocks will be trimmed online. + */ + if (!test_opt(sb, DISCARD)) + EXT4_MB_GRP_CLEAR_TRIMMED(db); - snprintf(devname, sizeof(devname) - 1, "%s", - bdevname(sb->s_bdev, devname)); - remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc); - remove_proc_entry(devname, proc_root_ext4); + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + page_cache_release(e4b.bd_buddy_page); + page_cache_release(e4b.bd_bitmap_page); + } + ext4_unlock_group(sb, entry->efd_group); + kmem_cache_free(ext4_free_data_cachep, entry); + ext4_mb_unload_buddy(&e4b); - return 0; + mb_debug(1, "freed %u blocks in %u structures\n", count, count2); } -int __init init_ext4_mballoc(void) +int __init ext4_init_mballoc(void) { - ext4_pspace_cachep = - kmem_cache_create("ext4_prealloc_space", - sizeof(struct ext4_prealloc_space), - 0, SLAB_RECLAIM_ACCOUNT, NULL); + ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, + SLAB_RECLAIM_ACCOUNT); if (ext4_pspace_cachep == NULL) return -ENOMEM; - ext4_ac_cachep = - kmem_cache_create("ext4_alloc_context", - sizeof(struct ext4_allocation_context), - 0, SLAB_RECLAIM_ACCOUNT, NULL); + ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, + SLAB_RECLAIM_ACCOUNT); if (ext4_ac_cachep == NULL) { kmem_cache_destroy(ext4_pspace_cachep); return -ENOMEM; } -#ifdef CONFIG_PROC_FS - proc_root_ext4 = proc_mkdir(EXT4_ROOT, proc_root_fs); - if (proc_root_ext4 == NULL) - printk(KERN_ERR "EXT4-fs: Unable to create %s\n", EXT4_ROOT); -#endif + + ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, + SLAB_RECLAIM_ACCOUNT); + if (ext4_free_data_cachep == NULL) { + kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + return -ENOMEM; + } return 0; } -void exit_ext4_mballoc(void) +void ext4_exit_mballoc(void) { - /* XXX: synchronize_rcu(); */ + /* + * Wait for completion of call_rcu()'s on ext4_pspace_cachep + * before destroying the slab cache. 
+ */ + rcu_barrier(); kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); -#ifdef CONFIG_PROC_FS - remove_proc_entry(EXT4_ROOT, proc_root_fs); -#endif + kmem_cache_destroy(ext4_free_data_cachep); + ext4_groupinfo_destroy_slabs(); }
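The rcu_barrier() added above is the standard pattern when a slab cache holds RCU-freed objects: synchronize_rcu() only waits for a grace period, while rcu_barrier() waits until every pending call_rcu() callback has actually run, so no callback can touch the cache after it is destroyed. An illustrative restatement of the pattern (not code from this patch):

/* Illustrative teardown order for a cache whose objects are freed
 * via call_rcu(): drain the callbacks first, then destroy the cache. */
static void destroy_rcu_backed_cache(struct kmem_cache *cachep)
{
        rcu_barrier();                  /* wait for pending call_rcu() work */
        kmem_cache_destroy(cachep);     /* now no callback can use cachep */
}

/* - * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps + * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps * Returns 0 if success or error code */ -static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, - handle_t *handle) +static noinline_for_stack int +ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, + handle_t *handle, unsigned int reserv_clstrs) { struct buffer_head *bitmap_bh = NULL; - struct ext4_super_block *es; struct ext4_group_desc *gdp; struct buffer_head *gdp_bh; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block; - int err; + int err, len; BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(ac->ac_b_ex.fe_len <= 0); sb = ac->ac_sb; sbi = EXT4_SB(sb); - es = sbi->s_es; - - ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group, - gdp->bg_free_blocks_count); err = -EIO; - bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group); + bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); if (!bitmap_bh) goto out_err; + BUFFER_TRACE(bitmap_bh, "getting write access"); err = ext4_journal_get_write_access(handle, bitmap_bh); if (err) goto out_err; @@ -3065,23 +2894,35 @@ static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (!gdp) goto out_err; + ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, + ext4_free_group_clusters(sb, gdp)); + + BUFFER_TRACE(gdp_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdp_bh); if (err) goto out_err; - block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb) - + ac->ac_b_ex.fe_start - + le32_to_cpu(es->s_first_data_block); - - if (block == ext4_block_bitmap(sb, gdp) || - block == ext4_inode_bitmap(sb, gdp) || - in_range(block, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group)) { + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); - ext4_error(sb, __FUNCTION__, - "Allocating block in system zone - block = %llu", - block); + len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + if (!ext4_data_block_valid(sbi, block, len)) { + ext4_error(sb, "Allocating blocks %llu-%llu which overlap " + "fs metadata", block, block+len); + /* File system mounted not to panic on error. + * Fix the bitmap and repeat the block allocation; + * we leak some of the blocks here. 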
+ */ + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (!err) + err = -EAGAIN; + goto out_err; } + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); #ifdef AGGRESSIVE_CHECK { int i; @@ -3091,40 +2932,52 @@ static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } } #endif - mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data, - ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); - - spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); + ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - gdp->bg_free_blocks_count = - cpu_to_le16(ext4_free_blocks_after_init(sb, - ac->ac_b_ex.fe_group, - gdp)); - } - gdp->bg_free_blocks_count = - cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - - ac->ac_b_ex.fe_len); - gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); - spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); - percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); - - err = ext4_journal_dirty_metadata(handle, bitmap_bh); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, + ac->ac_b_ex.fe_group, gdp)); + } + len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; + ext4_free_group_clusters_set(sb, gdp, len); + ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); + + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); + /* + * Now reduce the dirty block count also. Should not go negative + */ + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + reserv_clstrs); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, + ac->ac_b_ex.fe_group); + atomic64_sub(ac->ac_b_ex.fe_len, + &sbi->s_flex_groups[flex_group].free_clusters); + } + + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (err) goto out_err; - err = ext4_journal_dirty_metadata(handle, gdp_bh); + err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); out_err: - sb->s_dirt = 1; brelse(bitmap_bh); return err; }
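Much of the churn in this hunk is the bigalloc cluster/block conversion. For reference, the two macros used throughout amount to shifts by s_cluster_bits; a hedged restatement (the real definitions live in ext4.h):

/* Sketch of the conversions, assuming cluster_bits = log2(blocks per
 * cluster). C2B scales clusters to blocks; NUM_B2C rounds blocks up
 * to whole clusters. */
#define C2B(cluster_bits, clusters)     ((clusters) << (cluster_bits))
#define NUM_B2C(cluster_bits, blocks)   \
        (((blocks) + (1U << (cluster_bits)) - 1) >> (cluster_bits))

/* e.g. with cluster_bits == 4: C2B(4, 3) == 48 blocks, while
 * NUM_B2C(4, 49) == 4 clusters, since 49 blocks spill into a 4th one. */

/* * here we normalize request for locality group - * Group request are normalized to s_strip size if we set the same via mount - * option. If not we set it to s_mb_group_prealloc which can be configured via - * /proc/fs/ext4/<partition>/group_prealloc + * Group requests are normalized to s_mb_group_prealloc, which goes to + * s_stripe if we set the same via mount option. + * s_mb_group_prealloc can be configured via + * /sys/fs/ext4/<partition>/mb_group_prealloc * * XXX: should we try to preallocate more than the group has now? 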
*/ @@ -3134,11 +2987,8 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) struct ext4_locality_group *lg = ac->ac_lg; BUG_ON(lg == NULL); - if (EXT4_SB(sb)->s_stripe) - ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; - else - ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; - mb_debug("#%u: goal %lu blocks for locality group\n", + ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; + mb_debug(1, "#%u: goal %u blocks for locality group\n", current->pid, ac->ac_g_ex.fe_len); } @@ -3146,15 +2996,18 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) * Normalization means making request better in terms of * size and alignment */ -static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, +static noinline_for_stack void +ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits, max; ext4_lblk_t end; - struct list_head *cur; - loff_t size, orig_size, start_off; - ext4_lblk_t start, orig_start; + loff_t size, start_off; + loff_t orig_size __maybe_unused; + ext4_lblk_t start; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_prealloc_space *pa; /* do normalize only data requests, metadata requests do not need preallocation */ @@ -3179,17 +3032,17 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* first, let's learn actual file size * given current request is allocated */ - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); size = size << bsbits; if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); + orig_size = size; - /* max available blocks in a free group */ - max = EXT4_BLOCKS_PER_GROUP(ac->ac_sb) - 1 - 1 - - EXT4_SB(ac->ac_sb)->s_itb_per_group; + /* max size of free chunks */ + max = 2 << bsbits; -#define NRL_CHECK_SIZE(req, size, max,bits) \ - (req <= (size) || max <= ((size) >> bits)) +#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ + (req <= (size) || max <= (chunk_size)) /* first, try to predict filesize */ /* XXX: should this table be tunable? 
*/ @@ -3208,16 +3061,16 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, size = 512 * 1024; } else if (size <= 1024 * 1024) { size = 1024 * 1024; - } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, bsbits)) { + } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> - (20 - bsbits)) << 20; - size = 1024 * 1024; - } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, bsbits)) { + (21 - bsbits)) << 21; + size = 2 * 1024 * 1024; + } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (22 - bsbits)) << 22; size = 4 * 1024 * 1024; } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, - (8<<20)>>bsbits, max, bsbits)) { + (8<<20)>>bsbits, max, 8 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (23 - bsbits)) << 23; size = 8 * 1024 * 1024; @@ -3225,8 +3078,8 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; size = ac->ac_o_ex.fe_len << bsbits; } - orig_size = size = size >> bsbits; - orig_start = start = start_off >> bsbits; + size = size >> bsbits; + start = start_off >> bsbits; /* don't cover already allocated blocks in selected range */ if (ar->pleft && start <= ar->lleft) { @@ -3240,11 +3093,8 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* check we don't cross already preallocated blocks */ rcu_read_lock(); - list_for_each_rcu(cur, &ei->i_prealloc_list) { - struct ext4_prealloc_space *pa; - unsigned long pa_end; - - pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + ext4_lblk_t pa_end; if (pa->pa_deleted) continue; @@ -3254,29 +3104,25 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, continue; } - pa_end = pa->pa_lstart + pa->pa_len; + pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), + pa->pa_len); /* PA must not overlap original request */ BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || ac->ac_o_ex.fe_logical < pa->pa_lstart)); - /* skip PA normalized request doesn't overlap with */ - if (pa->pa_lstart >= end) { - spin_unlock(&pa->pa_lock); - continue; - } - if (pa_end <= start) { + /* skip PAs this normalized request doesn't overlap with */ + if (pa->pa_lstart >= end || pa_end <= start) { spin_unlock(&pa->pa_lock); continue; } BUG_ON(pa->pa_lstart <= start && pa_end >= end); + /* adjust start or end to be adjacent to this pa */ if (pa_end <= ac->ac_o_ex.fe_logical) { BUG_ON(pa_end < start); start = pa_end; - } - - if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { + } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { BUG_ON(pa->pa_lstart > end); end = pa->pa_lstart; } @@ -3287,13 +3133,13 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* XXX: extra loop to check we really don't overlap preallocations */ rcu_read_lock(); - list_for_each_rcu(cur, &ei->i_prealloc_list) { - struct ext4_prealloc_space *pa; - unsigned long pa_end; - pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { + ext4_lblk_t pa_end; + spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0) { - pa_end = pa->pa_lstart + pa->pa_len; + pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), + pa->pa_len); BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); } spin_unlock(&pa->pa_lock); @@ -3302,20 +3148,21 @@ static void ext4_mb_normalize_request(struct 
ext4_allocation_context *ac, if (start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical) { - printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", - (unsigned long) start, (unsigned long) size, - (unsigned long) ac->ac_o_ex.fe_logical); + ext4_msg(ac->ac_sb, KERN_ERR, + "start %lu, size %lu, fe_logical %lu", + (unsigned long) start, (unsigned long) size, + (unsigned long) ac->ac_o_ex.fe_logical); } BUG_ON(start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical); - BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ /* XXX: is it better to align blocks WRT to logical * placement or satisfy big request as is */ ac->ac_g_ex.fe_logical = start; - ac->ac_g_ex.fe_len = size; + ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); /* define goal start in order to merge */ if (ar->pright && (ar->lright == (start + size))) { @@ -3333,7 +3180,7 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } - mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size, + mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size, (unsigned) orig_size, (unsigned) start); } @@ -3344,7 +3191,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { atomic_inc(&sbi->s_bal_reqs); atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); - if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len) + if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) atomic_inc(&sbi->s_bal_success); atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && @@ -3354,7 +3201,24 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) atomic_inc(&sbi->s_bal_breaks); } - ext4_mb_store_history(ac); + if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) + trace_ext4_mballoc_alloc(ac); + else + trace_ext4_mballoc_prealloc(ac); +} + +/* + * Called on failure; free up any blocks from the inode PA for this + * context. We don't need this for MB_GROUP_PA because we only change + * pa_free in ext4_mb_release_context(), but on failure, we've already + * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. 
+ */ +static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa = ac->ac_pa; + + if (pa && pa->pa_type == MB_INODE_PA) + pa->pa_free += ac->ac_b_ex.fe_len; } /* @@ -3363,14 +3227,16 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_fsblk_t start; ext4_fsblk_t end; int len; /* found preallocated blocks, use them */ start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); - end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len); - len = end - start; + end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), + start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); + len = EXT4_NUM_B2C(sbi, end - start); ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; @@ -3378,11 +3244,11 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, ac->ac_pa = pa; BUG_ON(start < pa->pa_pstart); - BUG_ON(start + len > pa->pa_pstart + pa->pa_len); + BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); BUG_ON(pa->pa_free < len); pa->pa_free -= len; - mb_debug("use %llu/%lu from inode pa %p\n", start, len, pa); + mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa); } /* @@ -3391,7 +3257,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { - unsigned len = ac->ac_o_ex.fe_len; + unsigned int len = ac->ac_o_ex.fe_len; ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, &ac->ac_b_ex.fe_group, @@ -3406,18 +3272,50 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, * in on-disk bitmap -- see ext4_mb_release_context() * Other CPUs are prevented from allocating from this pa by lg_mutex */ - mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); + mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); +} + +/* + * Return the prealloc space that has the minimal distance + * from the goal block. @cpa is the prealloc + * space with the currently known minimal distance + * from the goal block.
+ */ +static struct ext4_prealloc_space * +ext4_mb_check_group_pa(ext4_fsblk_t goal_block, + struct ext4_prealloc_space *pa, + struct ext4_prealloc_space *cpa) +{ + ext4_fsblk_t cur_distance, new_distance; + + if (cpa == NULL) { + atomic_inc(&pa->pa_count); + return pa; + } + cur_distance = abs(goal_block - cpa->pa_pstart); + new_distance = abs(goal_block - pa->pa_pstart); + + if (cur_distance <= new_distance) + return cpa; + + /* drop the previous reference */ + atomic_dec(&cpa->pa_count); + atomic_inc(&pa->pa_count); + return pa; } /* * search goal blocks in preallocated space */ -static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) +static noinline_for_stack int +ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; - struct ext4_prealloc_space *pa; - struct list_head *cur; + struct ext4_prealloc_space *pa, *cpa = NULL; + ext4_fsblk_t goal_block; /* only data can be preallocated */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) @@ -3425,13 +3323,19 @@ static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* first, try per-file preallocation */ rcu_read_lock(); - list_for_each_rcu(cur, &ei->i_prealloc_list) { - pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { /* all fields in this condition don't change, * so we can skip locking for them */ if (ac->ac_o_ex.fe_logical < pa->pa_lstart || - ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) + ac->ac_o_ex.fe_logical >= (pa->pa_lstart + + EXT4_C2B(sbi, pa->pa_len))) continue; + + /* non-extent files can't have physical blocks past 2^32 */ + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && + (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) > + EXT4_MAX_BLOCK_FILE_PHYS)) continue; /* found preallocated blocks, use them */ @@ -3456,32 +3360,70 @@ static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) lg = ac->ac_lg; if (lg == NULL) return 0; + order = fls(ac->ac_o_ex.fe_len) - 1; + if (order > PREALLOC_TB_SIZE - 1) + /* The max size of hash table is PREALLOC_TB_SIZE */ + order = PREALLOC_TB_SIZE - 1; - rcu_read_lock(); - list_for_each_rcu(cur, &lg->lg_prealloc_list) { - pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); - spin_lock(&pa->pa_lock); - if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) { - atomic_inc(&pa->pa_count); - ext4_mb_use_group_pa(ac, pa); + goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); + /* + * search for the prealloc space that has the + * minimal distance from the goal block. + */ + for (i = order; i < PREALLOC_TB_SIZE; i++) { + rcu_read_lock(); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], + pa_inode_list) { + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 0 && + pa->pa_free >= ac->ac_o_ex.fe_len) { + + cpa = ext4_mb_check_group_pa(goal_block, + pa, cpa); + } spin_unlock(&pa->pa_lock); - ac->ac_criteria = 20; - rcu_read_unlock(); - return 1; } - spin_unlock(&pa->pa_lock); + rcu_read_unlock(); + } + if (cpa) { + ext4_mb_use_group_pa(ac, cpa); + ac->ac_criteria = 20; + return 1; } - rcu_read_unlock(); - return 0; } /* + * the function goes through all blocks freed in the group + * but not yet committed and marks them used in the in-core bitmap.
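The locality-group lookup above is a per-order table search: a request of fe_len blocks computes order = fls(fe_len) - 1 and scans that bucket and every larger one, so a PA is only considered by requests it could actually satisfy. A standalone sketch of the bucket arithmetic (PREALLOC_TB_SIZE is assumed to be 10, its value in this patch's mballoc.h; toy_fls() stands in for the kernel's fls()):

#include <stdio.h>

#define PREALLOC_TB_SIZE 10	/* assumed, matches mballoc.h */

/* portable stand-in for the kernel's fls() */
static int toy_fls(unsigned int x)
{
	int r = 0;
	while (x) { r++; x >>= 1; }
	return r;
}

static int lg_order(unsigned int fe_len)
{
	int order = toy_fls(fe_len) - 1;
	if (order > PREALLOC_TB_SIZE - 1)
		order = PREALLOC_TB_SIZE - 1;
	return order;
}

int main(void)
{
	/* a 13-block request starts at bucket 3 and walks upward */
	printf("order for 13 blocks: %d\n", lg_order(13));
	return 0;
}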
+ * buddy must be generated from this bitmap + * Need to be called with the ext4 group lock held + */ +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group) +{ + struct rb_node *n; + struct ext4_group_info *grp; + struct ext4_free_data *entry; + + grp = ext4_get_group_info(sb, group); + n = rb_first(&(grp->bb_free_root)); + + while (n) { + entry = rb_entry(n, struct ext4_free_data, efd_node); + ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); + n = rb_next(n); + } + return; +} + +/* * the function goes through all preallocation in this group and marks them * used in in-core bitmap. buddy must be generated from this bitmap - * Need to be called with ext4 group lock (ext4_lock_group) + * Need to be called with ext4 group lock held */ -static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, +static noinline_for_stack +void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); @@ -3490,7 +3432,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t groupnr; ext4_grpblk_t start; int preallocated = 0; - int count = 0; int len; /* all form of preallocation discards first load group, @@ -3511,18 +3452,19 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, if (unlikely(len == 0)) continue; BUG_ON(groupnr != group); - mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), - bitmap, start, len); + ext4_set_bits(bitmap, start, len); preallocated += len; - count++; } - mb_debug("prellocated %u for group %lu\n", preallocated, group); + mb_debug(1, "prellocated %u for group %u\n", preallocated, group); } static void ext4_mb_pa_callback(struct rcu_head *head) { struct ext4_prealloc_space *pa; pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); + + BUG_ON(atomic_read(&pa->pa_count)); + BUG_ON(pa->pa_deleted == 0); kmem_cache_free(ext4_pspace_cachep, pa); } @@ -3533,13 +3475,16 @@ static void ext4_mb_pa_callback(struct rcu_head *head) static void ext4_mb_put_pa(struct ext4_allocation_context *ac, struct super_block *sb, struct ext4_prealloc_space *pa) { - unsigned long grp; - - if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) - return; + ext4_group_t grp; + ext4_fsblk_t grp_blk; /* in this short window concurrent discard can set pa_deleted */ spin_lock(&pa->pa_lock); + if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { + spin_unlock(&pa->pa_lock); + return; + } + if (pa->pa_deleted == 1) { spin_unlock(&pa->pa_lock); return; @@ -3548,8 +3493,15 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, pa->pa_deleted = 1; spin_unlock(&pa->pa_lock); - /* -1 is to protect from crossing allocation group */ - ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL); + grp_blk = pa->pa_pstart; + /* + * If doing group-based preallocation, pa_pstart may be in the + * next group when pa is used up + */ + if (pa->pa_type == MB_GROUP_PA) + grp_blk--; + + grp = ext4_get_group_number(sb, grp_blk); /* * possible race: @@ -3579,9 +3531,11 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, /* * creates new preallocated space for given inode */ -static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) +static noinline_for_stack int +ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_prealloc_space *pa; struct ext4_group_info 
*grp; struct ext4_inode_info *ei; @@ -3613,16 +3567,18 @@ static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; /* also, we should cover whole original request */ - wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len; + wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len); /* the smallest one defines real window */ win = min(winl, wins); - offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len; + offs = ac->ac_o_ex.fe_logical % + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (offs && offs < win) win = offs; - ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win; + ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - + EXT4_NUM_B2C(sbi, win); BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); } @@ -3637,14 +3593,17 @@ static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_free = pa->pa_len; atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; - pa->pa_linear = 0; + pa->pa_type = MB_INODE_PA; - mb_debug("new inode pa %p: %llu/%u for %u\n", pa, + mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_ext4_mb_new_inode_pa(ac, pa); ext4_mb_use_inode_pa(ac, pa); - atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); + atomic_add(pa->pa_free, &sbi->s_mb_preallocated); ei = EXT4_I(ac->ac_inode); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); @@ -3666,7 +3625,8 @@ static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) /* * creates new preallocated space for locality group inodes belongs to */ -static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac) +static noinline_for_stack int +ext4_mb_new_group_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg; @@ -3693,11 +3653,14 @@ static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_free = pa->pa_len; atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; - pa->pa_linear = 1; + pa->pa_type = MB_GROUP_PA; - mb_debug("new group pa %p: %llu/%u for %u\n", pa, + mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_ext4_mb_new_group_pa(ac, pa); ext4_mb_use_group_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); @@ -3713,10 +3676,10 @@ static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac) list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); - spin_lock(pa->pa_obj_lock); - list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list); - spin_unlock(pa->pa_obj_lock); - + /* + * We will later add the new pa to the right bucket + * after updating the pa_free in ext4_mb_release_context + */ return 0; } @@ -3739,107 +3702,76 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) * the caller MUST hold group/inode locks. 
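The window arithmetic in ext4_mb_new_inode_pa() above slides the preallocation back so it still covers the original request: winl is the slack between the goal and the original logical block, wins is the surplus the buddy search returned, and the smallest of winl, wins, and the logical offset decides how far left the PA may start. A rough restatement in plain blocks, eliding the cluster conversions (names are illustrative; b_len is assumed nonzero):

/* choose the logical start of the new PA; a sketch of the
 * winl/wins/offs computation, in blocks for simplicity */
static long long pa_logical_start(long long o_logical, long long g_logical,
				  long long b_len, long long o_len)
{
	long long winl = o_logical - g_logical;	/* room back to the goal */
	long long wins = b_len - o_len;		/* surplus blocks we got */
	long long win = winl < wins ? winl : wins;
	long long offs = o_logical % b_len;

	if (offs && offs < win)
		win = offs;
	return o_logical - win;
}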
* TODO: optimize the case when there are no in-core structures yet */ -static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, - struct buffer_head *bitmap_bh, - struct ext4_prealloc_space *pa) +static noinline_for_stack int +ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, + struct ext4_prealloc_space *pa) { - struct ext4_allocation_context *ac; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned long end; - unsigned long next; + unsigned int end; + unsigned int next; ext4_group_t group; ext4_grpblk_t bit; - sector_t start; + unsigned long long grp_blk_start; int err = 0; int free = 0; BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - - if (ac) { - ac->ac_sb = sb; - ac->ac_inode = pa->pa_inode; - ac->ac_op = EXT4_MB_HISTORY_DISCARD; - } - while (bit < end) { bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); - if (next > end) - next = end; - start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + - le32_to_cpu(sbi->s_es->s_first_data_block); - mb_debug(" free preallocated %u/%u in group %u\n", - (unsigned) start, (unsigned) next - bit, - (unsigned) group); + mb_debug(1, " free preallocated %u/%u in group %u\n", + (unsigned) ext4_group_first_block_no(sb, group) + bit, + (unsigned) next - bit, (unsigned) group); free += next - bit; - if (ac) { - ac->ac_b_ex.fe_group = group; - ac->ac_b_ex.fe_start = bit; - ac->ac_b_ex.fe_len = next - bit; - ac->ac_b_ex.fe_logical = 0; - ext4_mb_store_history(ac); - } - + trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); + trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + + EXT4_C2B(sbi, bit)), + next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } if (free != pa->pa_free) { - printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", - pa, (unsigned long) pa->pa_lstart, - (unsigned long) pa->pa_pstart, - (unsigned long) pa->pa_len); - ext4_error(sb, __FUNCTION__, "free %u, pa_free %u\n", - free, pa->pa_free); + ext4_msg(e4b->bd_sb, KERN_CRIT, + "pa %p: logic %lu, phys. %lu, len %lu", + pa, (unsigned long) pa->pa_lstart, + (unsigned long) pa->pa_pstart, + (unsigned long) pa->pa_len); + ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", + free, pa->pa_free); /* * pa is already deleted so we use the value obtained * from the bitmap and continue. 
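The loop above frees an inode PA by scanning the block bitmap between bit and end: each run of zero bits is still owned by the PA and goes back to the buddy, each run of one bits was handed out to the file and is skipped. The same scan over a byte-per-bit toy bitmap, with the two find-bit helpers inlined (a sketch, not the kernel's bitmap API):

/* free each zero-run in bitmap[bit..end); returns blocks freed */
static int toy_release_pa(const unsigned char *bitmap, int bit, int end)
{
	int free = 0;

	while (bit < end) {
		while (bit < end && bitmap[bit])	/* skip used blocks */
			bit++;
		int next = bit;
		while (next < end && !bitmap[next])	/* extend the zero run */
			next++;
		free += next - bit;			/* "mb_free_blocks" */
		bit = next + 1;
	}
	return free;	/* compared against pa_free as a sanity check */
}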
*/ } atomic_add(free, &sbi->s_mb_discarded); - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); return err; } -static int ext4_mb_release_group_pa(struct ext4_buddy *e4b, +static noinline_for_stack int +ext4_mb_release_group_pa(struct ext4_buddy *e4b, struct ext4_prealloc_space *pa) { - struct ext4_allocation_context *ac; struct super_block *sb = e4b->bd_sb; ext4_group_t group; ext4_grpblk_t bit; - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - - if (ac) - ac->ac_op = EXT4_MB_HISTORY_DISCARD; - + trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); - - if (ac) { - ac->ac_sb = sb; - ac->ac_inode = NULL; - ac->ac_b_ex.fe_group = group; - ac->ac_b_ex.fe_start = bit; - ac->ac_b_ex.fe_len = pa->pa_len; - ac->ac_b_ex.fe_logical = 0; - ext4_mb_store_history(ac); - kmem_cache_free(ext4_ac_cachep, ac); - } + trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); return 0; } @@ -3853,7 +3785,8 @@ static int ext4_mb_release_group_pa(struct ext4_buddy *e4b, * - how many do we discard * 1) how many requested */ -static int ext4_mb_discard_group_preallocations(struct super_block *sb, +static noinline_for_stack int +ext4_mb_discard_group_preallocations(struct super_block *sb, ext4_group_t group, int needed) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); @@ -3865,27 +3798,28 @@ static int ext4_mb_discard_group_preallocations(struct super_block *sb, int busy = 0; int free = 0; - mb_debug("discard preallocation for group %lu\n", group); + mb_debug(1, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) return 0; - bitmap_bh = read_block_bitmap(sb, group); + bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { - /* error handling here */ - ext4_mb_release_desc(&e4b); - BUG_ON(bitmap_bh == NULL); + ext4_error(sb, "Error reading block bitmap for %u", group); + return 0; } err = ext4_mb_load_buddy(sb, group, &e4b); - BUG_ON(err != 0); /* error handling here */ + if (err) { + ext4_error(sb, "Error loading buddy information for %u", group); + put_bh(bitmap_bh); + return 0; + } if (needed == 0) - needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; + needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; - grp = ext4_get_group_info(sb, group); INIT_LIST_HEAD(&list); - repeat: ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, @@ -3917,11 +3851,7 @@ repeat: if (free < needed && busy) { busy = 0; ext4_unlock_group(sb, group); - /* - * Yield the CPU here so that we don't get soft lockup - * in non preempt case. - */ - yield(); + cond_resched(); goto repeat; } @@ -3939,7 +3869,7 @@ repeat: list_del_rcu(&pa->pa_inode_list); spin_unlock(pa->pa_obj_lock); - if (pa->pa_linear) + if (pa->pa_type == MB_GROUP_PA) ext4_mb_release_group_pa(&e4b, pa); else ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); @@ -3950,7 +3880,7 @@ repeat: out: ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); return free; } @@ -3964,7 +3894,7 @@ out: * * FIXME!! 
Make sure it is valid at all the call sites */ -void ext4_mb_discard_inode_preallocations(struct inode *inode) +void ext4_discard_preallocations(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct super_block *sb = inode->i_sb; @@ -3975,12 +3905,13 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode) struct ext4_buddy e4b; int err; - if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) { + if (!S_ISREG(inode->i_mode)) { /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ return; } - mb_debug("discard preallocation for inode %lu\n", inode->i_ino); + mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino); + trace_ext4_discard_preallocations(inode); INIT_LIST_HEAD(&list); @@ -3997,7 +3928,8 @@ repeat: * use preallocation while we're discarding it */ spin_unlock(&pa->pa_lock); spin_unlock(&ei->i_prealloc_lock); - printk(KERN_ERR "uh-oh! used pa while discarding\n"); + ext4_msg(sb, KERN_ERR, + "uh-oh! used pa while discarding"); WARN_ON(1); schedule_timeout_uninterruptible(HZ); goto repeat; @@ -4033,17 +3965,22 @@ repeat: spin_unlock(&ei->i_prealloc_lock); list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { - BUG_ON(pa->pa_linear != 0); - ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + BUG_ON(pa->pa_type != MB_INODE_PA); + group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy(sb, group, &e4b); - BUG_ON(err != 0); /* error handling here */ + if (err) { + ext4_error(sb, "Error loading buddy information for %u", + group); + continue; + } - bitmap_bh = read_block_bitmap(sb, group); + bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { - /* error handling here */ - ext4_mb_release_desc(&e4b); - BUG_ON(bitmap_bh == NULL); + ext4_error(sb, "Error reading block bitmap for %u", + group); + ext4_mb_unload_buddy(&e4b); + continue; } ext4_lock_group(sb, group); @@ -4051,7 +3988,7 @@ repeat: ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); ext4_unlock_group(sb, group); - ext4_mb_release_desc(&e4b); + ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); list_del(&pa->u.pa_tmp_list); @@ -4059,31 +3996,23 @@ repeat: } } -/* - * finds all preallocated spaces and return blocks being freed to them - * if preallocated space becomes full (no block is used from the space) - * then the function frees space in buddy - * XXX: at the moment, truncate (which is the only way to free blocks) - * discards all preallocations - */ -static void ext4_mb_return_to_preallocation(struct inode *inode, - struct ext4_buddy *e4b, - sector_t block, int count) -{ - BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); -} -#ifdef MB_DEBUG +#ifdef CONFIG_EXT4_DEBUG static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - ext4_group_t i; + ext4_group_t ngroups, i; + + if (!ext4_mballoc_debug || + (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) + return; - printk(KERN_ERR "EXT4-fs: Can't allocate:" - " Allocation context details:\n"); - printk(KERN_ERR "EXT4-fs: status %d flags %d\n", + ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:" + " Allocation context details:"); + ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d", ac->ac_status, ac->ac_flags); - printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " - "best %lu/%lu/%lu@%lu cr %d\n", + ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " + "goal %lu/%lu/%lu@%lu, " + "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, (unsigned long)ac->ac_o_ex.fe_start, (unsigned long)ac->ac_o_ex.fe_len, @@ -4097,10 
+4026,10 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, - ac->ac_found); - printk(KERN_ERR "EXT4-fs: groups: \n"); - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found); + ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); + ngroups = ext4_get_groups_count(sb); + for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); struct ext4_prealloc_space *pa; ext4_grpblk_t start; @@ -4113,14 +4042,14 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) ext4_get_group_no_and_offset(sb, pa->pa_pstart, NULL, &start); spin_unlock(&pa->pa_lock); - printk(KERN_ERR "PA:%lu:%d:%u \n", i, - start, pa->pa_len); + printk(KERN_ERR "PA:%u:%d:%u \n", i, + start, pa->pa_len); } - ext4_lock_group(sb, i); + ext4_unlock_group(sb, i); if (grp->bb_free == 0) continue; - printk(KERN_ERR "%lu: %d/%d \n", + printk(KERN_ERR "%u: %d/%d \n", i, grp->bb_free, grp->bb_fragments); } printk(KERN_ERR "\n"); @@ -4137,7 +4066,7 @@ static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) * file is determined by the current size or the resulting size after * allocation which ever is larger * - * One can tune this size via /proc/fs/ext4/<partition>/stream_req + * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req */ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) { @@ -4148,16 +4077,31 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; - isize = i_size_read(ac->ac_inode) >> bsbits; - size = max(size, isize); + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; - /* don't use group allocation for large files */ - if (size >= sbi->s_mb_stream_request) + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) + >> bsbits; + + if ((size == isize) && + !ext4_fs_is_busy(sbi) && + (atomic_read(&ac->ac_inode->i_writecount) == 0)) { + ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; return; + } - if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + if (sbi->s_mb_group_prealloc <= 0) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; return; + } + + /* don't use group allocation for large files */ + size = max(size, isize); + if (size > sbi->s_mb_stream_request) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; + } BUG_ON(ac->ac_lg != NULL); /* @@ -4165,8 +4109,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) * per cpu locality group is to reduce the contention between block * request from multiple CPUs. 
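The policy just shown splits allocations by file size: when the larger of the request's end offset and the on-disk size reaches the mb_stream_req tunable, the inode streams from its own goal instead of sharing the per-CPU locality group. The core decision in isolation, with the tunable as a plain parameter and the busy-filesystem and writecount special cases above left out:

/* returns 1 when the request should use the per-CPU group PA,
 * 0 when it should stream; a sketch of ext4_mb_group_or_file() */
static int use_group_pa(unsigned long long req_end_blocks,
			unsigned long long isize_blocks,
			unsigned long stream_request)
{
	unsigned long long size = req_end_blocks > isize_blocks ?
				  req_end_blocks : isize_blocks;

	return size <= stream_request;	/* small files share a group PA */
}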
*/ - ac->ac_lg = &sbi->s_locality_groups[get_cpu()]; - put_cpu(); + ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups); /* we're going to use group allocation */ ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; @@ -4175,23 +4118,24 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) mutex_lock(&ac->ac_lg->lg_mutex); } -static int ext4_mb_initialize_context(struct ext4_allocation_context *ac, +static noinline_for_stack int +ext4_mb_initialize_context(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { struct super_block *sb = ar->inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_group_t group; - unsigned long len; - unsigned long goal; + unsigned int len; + ext4_fsblk_t goal; ext4_grpblk_t block; /* we can't allocate > group size */ len = ar->len; /* just a dirty hack to filter too big requests */ - if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10) - len = EXT4_BLOCKS_PER_GROUP(sb) - 10; + if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) + len = EXT4_CLUSTERS_PER_GROUP(sb); /* start searching from the goal */ goal = ar->goal; @@ -4201,38 +4145,22 @@ static int ext4_mb_initialize_context(struct ext4_allocation_context *ac, ext4_get_group_no_and_offset(sb, goal, &group, &block); /* set up allocation goals */ - ac->ac_b_ex.fe_logical = ar->logical; - ac->ac_b_ex.fe_group = 0; - ac->ac_b_ex.fe_start = 0; - ac->ac_b_ex.fe_len = 0; + ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); ac->ac_status = AC_STATUS_CONTINUE; - ac->ac_groups_scanned = 0; - ac->ac_ex_scanned = 0; - ac->ac_found = 0; ac->ac_sb = sb; ac->ac_inode = ar->inode; - ac->ac_o_ex.fe_logical = ar->logical; + ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; ac->ac_o_ex.fe_group = group; ac->ac_o_ex.fe_start = block; ac->ac_o_ex.fe_len = len; - ac->ac_g_ex.fe_logical = ar->logical; - ac->ac_g_ex.fe_group = group; - ac->ac_g_ex.fe_start = block; - ac->ac_g_ex.fe_len = len; - ac->ac_f_ex.fe_len = 0; + ac->ac_g_ex = ac->ac_o_ex; ac->ac_flags = ar->flags; - ac->ac_2order = 0; - ac->ac_criteria = 0; - ac->ac_pa = NULL; - ac->ac_bitmap_page = NULL; - ac->ac_buddy_page = NULL; - ac->ac_lg = NULL; /* we have to define context: we'll we work with a file or * locality group. this is a policy, actually */ ext4_mb_group_or_file(ac); - mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " + mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " "left: %u/%u, right %u/%u to %swritable\n", (unsigned) ar->len, (unsigned) ar->logical, (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, @@ -4243,22 +4171,167 @@ static int ext4_mb_initialize_context(struct ext4_allocation_context *ac, } +static noinline_for_stack void +ext4_mb_discard_lg_preallocations(struct super_block *sb, + struct ext4_locality_group *lg, + int order, int total_entries) +{ + ext4_group_t group = 0; + struct ext4_buddy e4b; + struct list_head discard_list; + struct ext4_prealloc_space *pa, *tmp; + + mb_debug(1, "discard locality group preallocation\n"); + + INIT_LIST_HEAD(&discard_list); + + spin_lock(&lg->lg_prealloc_lock); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], + pa_inode_list) { + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + /* + * This is the pa that we just used + * for block allocation. So don't + * free that + */ + spin_unlock(&pa->pa_lock); + continue; + } + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + /* only lg prealloc space */ + BUG_ON(pa->pa_type != MB_GROUP_PA); + + /* seems this one can be freed ... 
*/ + pa->pa_deleted = 1; + spin_unlock(&pa->pa_lock); + + list_del_rcu(&pa->pa_inode_list); + list_add(&pa->u.pa_tmp_list, &discard_list); + + total_entries--; + if (total_entries <= 5) { + /* + * we want to keep only 5 entries + * allowing it to grow to 8. This + * makes sure we don't call discard + * soon for this list. + */ + break; + } + } + spin_unlock(&lg->lg_prealloc_lock); + + list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { + + group = ext4_get_group_number(sb, pa->pa_pstart); + if (ext4_mb_load_buddy(sb, group, &e4b)) { + ext4_error(sb, "Error loading buddy information for %u", + group); + continue; + } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_mb_release_group_pa(&e4b, pa); + ext4_unlock_group(sb, group); + + ext4_mb_unload_buddy(&e4b); + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } +} + +/* + * We have incremented pa_count. So it cannot be freed at this + * point. Also we hold lg_mutex. So no parallel allocation is + * possible from this lg. That means pa_free cannot be updated. + * + * A parallel ext4_mb_discard_group_preallocations is possible, + * which can cause the lg_prealloc_list to be updated. + */ + +static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) +{ + int order, added = 0, lg_prealloc_count = 1; + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg = ac->ac_lg; + struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; + + order = fls(pa->pa_free) - 1; + if (order > PREALLOC_TB_SIZE - 1) + /* The max size of hash table is PREALLOC_TB_SIZE */ + order = PREALLOC_TB_SIZE - 1; + /* Add the prealloc space to lg */ + spin_lock(&lg->lg_prealloc_lock); + list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], + pa_inode_list) { + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted) { + spin_unlock(&tmp_pa->pa_lock); + continue; + } + if (!added && pa->pa_free < tmp_pa->pa_free) { + /* Add to the tail of the previous entry */ + list_add_tail_rcu(&pa->pa_inode_list, + &tmp_pa->pa_inode_list); + added = 1; + /* + * we want to count the total + * number of entries in the list + */ + } + spin_unlock(&tmp_pa->pa_lock); + lg_prealloc_count++; + } + if (!added) + list_add_tail_rcu(&pa->pa_inode_list, + &lg->lg_prealloc_list[order]); + spin_unlock(&lg->lg_prealloc_lock); + + /* Now trim the list to be not more than 8 elements */ + if (lg_prealloc_count > 8) { + ext4_mb_discard_lg_preallocations(sb, lg, + order, lg_prealloc_count); + return; + } + return ; +} + /* * release all resource we used in allocation */ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { - if (ac->ac_pa) { - if (ac->ac_pa->pa_linear) { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_prealloc_space *pa = ac->ac_pa; + if (pa) { + if (pa->pa_type == MB_GROUP_PA) { /* see comment in ext4_mb_use_group_pa() */ - spin_lock(&ac->ac_pa->pa_lock); - ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len; - ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len; - ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len; - ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len; - spin_unlock(&ac->ac_pa->pa_lock); + spin_lock(&pa->pa_lock); + pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + pa->pa_free -= ac->ac_b_ex.fe_len; + pa->pa_len -= ac->ac_b_ex.fe_len; + spin_unlock(&pa->pa_lock); + } + } + if (pa) { + /* + * We want to add the pa to the right bucket.
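ext4_mb_add_n_trim() above keeps each order bucket sorted by pa_free, smallest first (the list_add_tail_rcu() lands the new PA before the first entry with more free space), and schedules a discard pass once the bucket has grown past 8 entries so it shrinks back toward 5. A sketch with the RCU list replaced by a sorted array; LG_KEEP and LG_MAX are illustrative names for the hard-coded 5 and 8:

#define LG_KEEP	5	/* discard trims the bucket back to this */
#define LG_MAX	8	/* ... once it has grown past this */

/* insert pa_free into an ascending array; returns 1 when the
 * caller should run a discard pass down to LG_KEEP entries */
static int bucket_add(int *free_tbl, int *nr, int pa_free)
{
	int i = 0, j;

	while (i < *nr && free_tbl[i] <= pa_free)	/* before first larger */
		i++;
	for (j = *nr; j > i; j--)			/* shift the tail */
		free_tbl[j] = free_tbl[j - 1];
	free_tbl[i] = pa_free;
	(*nr)++;
	return *nr > LG_MAX;
}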
+ * Remove it from the list and while adding + * make sure the list to which we are adding + * doesn't grow big. + */ + if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + ext4_mb_add_n_trim(ac); } - ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa); + ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->ac_bitmap_page) page_cache_release(ac->ac_bitmap_page); @@ -4272,11 +4345,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) { - ext4_group_t i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); int ret; int freed = 0; - for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { + trace_ext4_mb_discard_preallocations(sb, needed); + for (i = 0; i < ngroups && needed > 0; i++) { ret = ext4_mb_discard_group_preallocations(sb, i, needed); freed += ret; needed -= ret; @@ -4291,42 +4365,76 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) * to usual allocation */ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, - struct ext4_allocation_request *ar, int *errp) + struct ext4_allocation_request *ar, int *errp) { + int freed; struct ext4_allocation_context *ac = NULL; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block = 0; - int freed; - int inquota; + unsigned int inquota = 0; + unsigned int reserv_clstrs = 0; + might_sleep(); sb = ar->inode->i_sb; sbi = EXT4_SB(sb); - if (!test_opt(sb, MBALLOC)) { - block = ext4_new_blocks_old(handle, ar->inode, ar->goal, - &(ar->len), errp); - return block; - } + trace_ext4_request_blocks(ar); - while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { - ar->flags |= EXT4_MB_HINT_NOPREALLOC; - ar->len--; - } - if (ar->len == 0) { - *errp = -EDQUOT; - return 0; + /* Allow to use superuser reservation for quota file */ + if (IS_NOQUOTA(ar->inode)) + ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; + + /* + * For delayed allocation, we could skip the ENOSPC and + * EDQUOT check, as blocks and quotas have been already + * reserved when data being copied into pagecache. + */ + if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) + ar->flags |= EXT4_MB_DELALLOC_RESERVED; + else { + /* Without delayed allocation we need to verify + * there is enough free blocks to do block allocation + * and verify allocation doesn't exceed the quota limits. 
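The claim loop that follows never fails a request outright while shrinking can still help: the length is halved until the free-cluster claim succeeds or reaches zero, and only a zero length surfaces as ENOSPC. The same shape in isolation (toy_claim() stands in for ext4_claim_free_clusters() and, like the kernel helper, returns nonzero on failure):

/* shrink a request until the cluster claim succeeds; returns the
 * granted length, 0 meaning ENOSPC */
static unsigned int claim_or_shrink(unsigned int len,
				    int (*toy_claim)(unsigned int))
{
	while (len && toy_claim(len))
		len >>= 1;	/* let others free space, retry smaller */
	return len;
}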
+ */ + while (ar->len && + ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { + + /* let others to free the space */ + cond_resched(); + ar->len = ar->len >> 1; + } + if (!ar->len) { + *errp = -ENOSPC; + return 0; + } + reserv_clstrs = ar->len; + if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { + dquot_alloc_block_nofail(ar->inode, + EXT4_C2B(sbi, ar->len)); + } else { + while (ar->len && + dquot_alloc_block(ar->inode, + EXT4_C2B(sbi, ar->len))) { + + ar->flags |= EXT4_MB_HINT_NOPREALLOC; + ar->len--; + } + } + inquota = ar->len; + if (ar->len == 0) { + *errp = -EDQUOT; + goto out; + } } - inquota = ar->len; - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { + ar->len = 0; *errp = -ENOMEM; - return 0; + goto out; } - ext4_mb_poll_new_transaction(sb, handle); - *errp = ext4_mb_initialize_context(ac, ar); if (*errp) { ar->len = 0; @@ -4335,212 +4443,315 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ac->ac_op = EXT4_MB_HISTORY_PREALLOC; if (!ext4_mb_use_preallocated(ac)) { - ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); - repeat: /* allocate space in core */ - ext4_mb_regular_allocator(ac); + *errp = ext4_mb_regular_allocator(ac); + if (*errp) + goto discard_and_exit; /* as we've just preallocated more space than - * user requested orinally, we store allocated + * user requested originally, we store allocated * space in a special descriptor */ if (ac->ac_status == AC_STATUS_FOUND && - ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) - ext4_mb_new_preallocation(ac); + ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) + *errp = ext4_mb_new_preallocation(ac); + if (*errp) { + discard_and_exit: + ext4_discard_allocated_blocks(ac); + goto errout; + } } - if (likely(ac->ac_status == AC_STATUS_FOUND)) { - ext4_mb_mark_diskspace_used(ac, handle); - *errp = 0; - block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); - ar->len = ac->ac_b_ex.fe_len; + *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); + if (*errp == -EAGAIN) { + /* + * drop the reference that we took + * in ext4_mb_use_best_found + */ + ext4_mb_release_context(ac); + ac->ac_b_ex.fe_group = 0; + ac->ac_b_ex.fe_start = 0; + ac->ac_b_ex.fe_len = 0; + ac->ac_status = AC_STATUS_CONTINUE; + goto repeat; + } else if (*errp) { + ext4_discard_allocated_blocks(ac); + goto errout; + } else { + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + ar->len = ac->ac_b_ex.fe_len; + } } else { freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); if (freed) goto repeat; *errp = -ENOSPC; + } + +errout: + if (*errp) { ac->ac_b_ex.fe_len = 0; ar->len = 0; ext4_mb_show_ac(ac); } - ext4_mb_release_context(ac); - out: - if (ar->len < inquota) - DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); + if (inquota && ar->len < inquota) + dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); + if (!ar->len) { + if (!ext4_test_inode_state(ar->inode, + EXT4_STATE_DELALLOC_RESERVED)) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyclusters_counter, + reserv_clstrs); + } + + trace_ext4_allocate_blocks(ar, (unsigned long long)block); - kmem_cache_free(ext4_ac_cachep, ac); return block; } -static void ext4_mb_poll_new_transaction(struct super_block *sb, - handle_t *handle) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sbi->s_last_transaction == handle->h_transaction->t_tid) - return; - /* new transaction! time to close last one and free blocks for - * committed transaction. 
we know that only transaction can be - * active, so previos transaction can be being logged and we - * know that transaction before previous is known to be already - * logged. this means that now we may free blocks freed in all - * transactions before previous one. hope I'm clear enough ... */ - - spin_lock(&sbi->s_md_lock); - if (sbi->s_last_transaction != handle->h_transaction->t_tid) { - mb_debug("new transaction %lu, old %lu\n", - (unsigned long) handle->h_transaction->t_tid, - (unsigned long) sbi->s_last_transaction); - list_splice_init(&sbi->s_closed_transaction, - &sbi->s_committed_transaction); - list_splice_init(&sbi->s_active_transaction, - &sbi->s_closed_transaction); - sbi->s_last_transaction = handle->h_transaction->t_tid; - } - spin_unlock(&sbi->s_md_lock); - - ext4_mb_free_committed_blocks(sb); +/* + * We can merge two free data extents only if the physical blocks + * are contiguous, AND the extents were freed by the same transaction, + * AND the blocks are associated with the same group. + */ +static int can_merge(struct ext4_free_data *entry1, + struct ext4_free_data *entry2) +{ + if ((entry1->efd_tid == entry2->efd_tid) && + (entry1->efd_group == entry2->efd_group) && + ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster)) + return 1; + return 0; } -static int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, - ext4_group_t group, ext4_grpblk_t block, int count) +static noinline_for_stack int +ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, + struct ext4_free_data *new_entry) { + ext4_group_t group = e4b->bd_group; + ext4_grpblk_t cluster; + struct ext4_free_data *entry; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_free_metadata *md; - int i; + struct rb_node **n = &db->bb_free_root.rb_node, *node; + struct rb_node *parent = NULL, *new_node; + BUG_ON(!ext4_handle_valid(handle)); BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); - ext4_lock_group(sb, group); - for (i = 0; i < count; i++) { - md = db->bb_md_cur; - if (md && db->bb_tid != handle->h_transaction->t_tid) { - db->bb_md_cur = NULL; - md = NULL; + new_node = &new_entry->efd_node; + cluster = new_entry->efd_start_cluster; + + if (!*n) { + /* first free block exent. 
We need to + protect buddy cache from being freed, + * otherwise we'll refresh it from + * on-disk bitmap and lose not-yet-available + * blocks */ + page_cache_get(e4b->bd_buddy_page); + page_cache_get(e4b->bd_bitmap_page); + } + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_free_data, efd_node); + if (cluster < entry->efd_start_cluster) + n = &(*n)->rb_left; + else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) + n = &(*n)->rb_right; + else { + ext4_grp_locked_error(sb, group, 0, + ext4_group_first_block_no(sb, group) + + EXT4_C2B(sbi, cluster), + "Block already on to-be-freed list"); + return 0; } + } - if (md == NULL) { - ext4_unlock_group(sb, group); - md = kmalloc(sizeof(*md), GFP_NOFS); - if (md == NULL) - return -ENOMEM; - md->num = 0; - md->group = group; - - ext4_lock_group(sb, group); - if (db->bb_md_cur == NULL) { - spin_lock(&sbi->s_md_lock); - list_add(&md->list, &sbi->s_active_transaction); - spin_unlock(&sbi->s_md_lock); - /* protect buddy cache from being freed, - * otherwise we'll refresh it from - * on-disk bitmap and lose not-yet-available - * blocks */ - page_cache_get(e4b->bd_buddy_page); - page_cache_get(e4b->bd_bitmap_page); - db->bb_md_cur = md; - db->bb_tid = handle->h_transaction->t_tid; - mb_debug("new md 0x%p for group %lu\n", - md, md->group); - } else { - kfree(md); - md = db->bb_md_cur; - } + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &db->bb_free_root); + + /* Now try to see the extent can be merged to left and right */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, efd_node); + if (can_merge(entry, new_entry) && + ext4_journal_callback_try_del(handle, &entry->efd_jce)) { + new_entry->efd_start_cluster = entry->efd_start_cluster; + new_entry->efd_count += entry->efd_count; + rb_erase(node, &(db->bb_free_root)); + kmem_cache_free(ext4_free_data_cachep, entry); } + } - BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS); - md->blocks[md->num] = block + i; - md->num++; - if (md->num == EXT4_BB_MAX_BLOCKS) { - /* no more space, put full container on a sb's list */ - db->bb_md_cur = NULL; + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, efd_node); + if (can_merge(new_entry, entry) && + ext4_journal_callback_try_del(handle, &entry->efd_jce)) { + new_entry->efd_count += entry->efd_count; + rb_erase(node, &(db->bb_free_root)); + kmem_cache_free(ext4_free_data_cachep, entry); } } - ext4_unlock_group(sb, group); + /* Add the extent to transaction's private list */ + ext4_journal_callback_add(handle, ext4_free_data_callback, + &new_entry->efd_jce); return 0; } -/* - * Main entry point into mballoc to free blocks +/** + * ext4_free_blocks() -- Free given blocks and update quota + * @handle: handle for this transaction + * @inode: inode + * @block: start physical block to free + * @count: number of blocks to count + * @flags: flags used by ext4_free_blocks */ -void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, - unsigned long block, unsigned long count, - int metadata, unsigned long *freed) +void ext4_free_blocks(handle_t *handle, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t block, + unsigned long count, int flags) { struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; - struct ext4_allocation_context *ac = NULL; struct ext4_group_desc *gdp; - struct ext4_super_block *es; - unsigned long overflow; + unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; ext4_group_t block_group; 
struct ext4_sb_info *sbi; + struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_buddy e4b; + unsigned int count_clusters; int err = 0; int ret; - *freed = 0; - - ext4_mb_poll_new_transaction(sb, handle); + might_sleep(); + if (bh) { + if (block) + BUG_ON(block != bh->b_blocknr); + else + block = bh->b_blocknr; + } sbi = EXT4_SB(sb); - es = EXT4_SB(sb)->s_es; - if (block < le32_to_cpu(es->s_first_data_block) || - block + count < block || - block + count > ext4_blocks_count(es)) { - ext4_error(sb, __FUNCTION__, - "Freeing blocks not in datazone - " - "block = %lu, count = %lu", block, count); + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && + !ext4_data_block_valid(sbi, block, count)) { + ext4_error(sb, "Freeing blocks not in datazone - " + "block = %llu, count = %lu", block, count); goto error_return; } - ext4_debug("freeing block %lu\n", block); + ext4_debug("freeing block %llu\n", block); + trace_ext4_free_blocks(inode, block, count, flags); + + if (flags & EXT4_FREE_BLOCKS_FORGET) { + struct buffer_head *tbh = bh; + int i; + + BUG_ON(bh && (count > 1)); + + for (i = 0; i < count; i++) { + cond_resched(); + if (!bh) + tbh = sb_find_get_block(inode->i_sb, + block + i); + if (!tbh) + continue; + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, + inode, tbh, block + i); + } + } + + /* + * We need to make sure we don't reuse the freed block until + * after the transaction is committed, which we can do by + * treating the block as metadata, below. We make an + * exception if the inode is to be written in writeback mode + * since writeback mode has weak data consistency guarantees. + */ + if (!ext4_should_writeback_data(inode)) + flags |= EXT4_FREE_BLOCKS_METADATA; - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) { - ac->ac_op = EXT4_MB_HISTORY_FREE; - ac->ac_inode = inode; - ac->ac_sb = sb; + /* + * If the extent to be freed does not begin on a cluster + * boundary, we need to deal with partial clusters at the + * beginning and end of the extent. Normally we will free + * blocks at the beginning or the end unless we are explicitly + * requested to avoid doing so. + */ + overflow = EXT4_PBLK_COFF(sbi, block); + if (overflow) { + if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { + overflow = sbi->s_cluster_ratio - overflow; + block += overflow; + if (count > overflow) + count -= overflow; + else + return; + } else { + block -= overflow; + count += overflow; + } + } + overflow = EXT4_LBLK_COFF(sbi, count); + if (overflow) { + if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { + if (count > overflow) + count -= overflow; + else + return; + } else + count += sbi->s_cluster_ratio - overflow; } do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT( + ext4_get_group_info(sb, block_group)))) + return; + /* * Check to see if we are freeing blocks across a group * boundary. 
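With bigalloc, a free range that does not start or end on a cluster boundary is first widened to whole clusters (unless the NOFREE_FIRST/LAST_CLUSTER flags forbid it), which is exactly the head and tail fixups above; only then is it split at group boundaries. The widening alone, assuming a power-of-two cluster ratio (names are illustrative):

/* round [block, block+count) outward to whole clusters; mirrors
 * the EXT4_PBLK_COFF/EXT4_LBLK_COFF fixups in ext4_free_blocks() */
static void round_to_clusters(unsigned long long *block,
			      unsigned long *count,
			      unsigned int cluster_ratio)
{
	unsigned int head = *block & (cluster_ratio - 1);
	if (head) {			/* widen to the cluster start */
		*block -= head;
		*count += head;
	}
	unsigned int tail = *count & (cluster_ratio - 1);
	if (tail)			/* widen to the cluster end */
		*count += cluster_ratio - tail;
}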
*/ - if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { - overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); + if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { + overflow = EXT4_C2B(sbi, bit) + count - + EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; } - bitmap_bh = read_block_bitmap(sb, block_group); - if (!bitmap_bh) + count_clusters = EXT4_NUM_B2C(sbi, count); + bitmap_bh = ext4_read_block_bitmap(sb, block_group); + if (!bitmap_bh) { + err = -EIO; goto error_return; + } gdp = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!gdp) + if (!gdp) { + err = -EIO; goto error_return; + } if (in_range(ext4_block_bitmap(sb, gdp), block, count) || in_range(ext4_inode_bitmap(sb, gdp), block, count) || in_range(block, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group) || + EXT4_SB(sb)->s_itb_per_group) || in_range(block + count - 1, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group)) { + EXT4_SB(sb)->s_itb_per_group)) { - ext4_error(sb, __FUNCTION__, - "Freeing blocks in system zone - " - "Block = %lu, count = %lu", block, count); + ext4_error(sb, "Freeing blocks in system zone - " + "Block = %llu, count = %lu", block, count); + /* err = 0. ext4_std_error should be a no op */ + goto error_return; } BUFFER_TRACE(bitmap_bh, "getting write access"); @@ -4557,58 +4768,101 @@ do_more: err = ext4_journal_get_write_access(handle, gd_bh); if (err) goto error_return; - - err = ext4_mb_load_buddy(sb, block_group, &e4b); - if (err) - goto error_return; - #ifdef AGGRESSIVE_CHECK { int i; - for (i = 0; i < count; i++) + for (i = 0; i < count_clusters; i++) BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); } #endif - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, - bit, count); + trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_journal_dirty_metadata(handle, bitmap_bh); + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_return; - if (ac) { - ac->ac_b_ex.fe_group = block_group; - ac->ac_b_ex.fe_start = bit; - ac->ac_b_ex.fe_len = count; - ext4_mb_store_history(ac); - } + if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { + struct ext4_free_data *new_entry; + /* + * blocks being freed are metadata. these blocks shouldn't + * be used until this transaction is committed + */ + retry: + new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); + if (!new_entry) { + /* + * We use a retry loop because + * ext4_free_blocks() is not allowed to fail. + */ + cond_resched(); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + new_entry->efd_start_cluster = bit; + new_entry->efd_group = block_group; + new_entry->efd_count = count_clusters; + new_entry->efd_tid = handle->h_transaction->t_tid; - if (metadata) { - /* blocks being freed are metadata. these blocks shouldn't - * be used until this transaction is committed */ - ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); + ext4_mb_free_metadata(handle, &e4b, new_entry); } else { + /* need to update group_info->bb_free and bitmap + * with group lock held. 
generate_buddy look at + * them with group lock_held + */ + if (test_opt(sb, DISCARD)) { + err = ext4_issue_discard(sb, block_group, bit, count); + if (err && err != -EOPNOTSUPP) + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%d block:%d count:%lu failed" + " with %d", block_group, bit, count, + err); + } else + EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); + ext4_lock_group(sb, block_group); - err = mb_free_blocks(inode, &e4b, bit, count); - ext4_mb_return_to_preallocation(inode, &e4b, block, count); - ext4_unlock_group(sb, block_group); - BUG_ON(err != 0); + mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); + mb_free_blocks(inode, &e4b, bit, count_clusters); } - spin_lock(sb_bgl_lock(sbi, block_group)); - gdp->bg_free_blocks_count = - cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); - gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); - spin_unlock(sb_bgl_lock(sbi, block_group)); - percpu_counter_add(&sbi->s_freeblocks_counter, count); + ret = ext4_free_group_clusters(sb, gdp) + count_clusters; + ext4_free_group_clusters_set(sb, gdp, ret); + ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, block_group, gdp); + ext4_unlock_group(sb, block_group); - ext4_mb_release_desc(&e4b); + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + atomic64_add(count_clusters, + &sbi->s_flex_groups[flex_group].free_clusters); + } - *freed += count; + if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) { + percpu_counter_add(&sbi->s_dirtyclusters_counter, + count_clusters); + spin_lock(&ei->i_block_reservation_lock); + if (flags & EXT4_FREE_BLOCKS_METADATA) + ei->i_reserved_meta_blocks += count_clusters; + else + ei->i_reserved_data_blocks += count_clusters; + spin_unlock(&ei->i_block_reservation_lock); + if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_reclaim_block(inode, + EXT4_C2B(sbi, count_clusters)); + } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) + dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); + percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); + + ext4_mb_unload_buddy(&e4b); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); /* And the group descriptor block */ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_journal_dirty_metadata(handle, gd_bh); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); if (!err) err = ret; @@ -4618,11 +4872,365 @@ do_more: put_bh(bitmap_bh); goto do_more; } - sb->s_dirt = 1; error_return: brelse(bitmap_bh); ext4_std_error(sb, err); - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); return; } + +/** + * ext4_group_add_blocks() -- Add given blocks to an existing group + * @handle: handle to this transaction + * @sb: super block + * @block: start physical block to add to the block group + * @count: number of blocks to free + * + * This marks the blocks as free in the bitmap and buddy. 
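The crediting rule in ext4_group_add_blocks() below can be modelled on its own: only bits that were actually set count toward the freed total, and an already-clear bit is reported and skipped, so the group's free count is never inflated. A sketch over a byte-per-bit toy bitmap, with the test and the clear merged into one pass for brevity (the kernel tests first and clears later under the group lock):

/* clear [bit, bit+count) and return how many bits really flipped,
 * which becomes the free-count credit */
static int toy_add_blocks(unsigned char *bitmap, int bit, int count)
{
	int i, freed = 0;

	for (i = 0; i < count; i++) {
		if (bitmap[bit + i]) {
			bitmap[bit + i] = 0;
			freed++;
		} /* else: "bit already cleared", report and continue */
	}
	return freed;
}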
+ */ +int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count) +{ + struct buffer_head *bitmap_bh = NULL; + struct buffer_head *gd_bh; + ext4_group_t block_group; + ext4_grpblk_t bit; + unsigned int i; + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_buddy e4b; + int err = 0, ret, blk_free_count; + ext4_grpblk_t blocks_freed; + + ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); + + if (count == 0) + return 0; + + ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { + ext4_warning(sb, "too much blocks added to group %u\n", + block_group); + err = -EINVAL; + goto error_return; + } + + bitmap_bh = ext4_read_block_bitmap(sb, block_group); + if (!bitmap_bh) { + err = -EIO; + goto error_return; + } + + desc = ext4_get_group_desc(sb, block_group, &gd_bh); + if (!desc) { + err = -EIO; + goto error_return; + } + + if (in_range(ext4_block_bitmap(sb, desc), block, count) || + in_range(ext4_inode_bitmap(sb, desc), block, count) || + in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || + in_range(block + count - 1, ext4_inode_table(sb, desc), + sbi->s_itb_per_group)) { + ext4_error(sb, "Adding blocks in system zones - " + "Block = %llu, count = %lu", + block, count); + err = -EINVAL; + goto error_return; + } + + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext4_journal_get_write_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + + for (i = 0, blocks_freed = 0; i < count; i++) { + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { + ext4_error(sb, "bit already cleared for block %llu", + (ext4_fsblk_t)(block + i)); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + blocks_freed++; + } + } + + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_return; + + /* + * need to update group_info->bb_free and bitmap + * with group lock held. 
generate_buddy look at + * them with group lock_held + */ + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count); + mb_free_blocks(NULL, &e4b, bit, count); + blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc); + ext4_free_group_clusters_set(sb, desc, blk_free_count); + ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh); + ext4_group_desc_csum_set(sb, block_group, desc); + ext4_unlock_group(sb, block_group); + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_NUM_B2C(sbi, blocks_freed)); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed), + &sbi->s_flex_groups[flex_group].free_clusters); + } + + ext4_mb_unload_buddy(&e4b); + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); + if (!err) + err = ret; + +error_return: + brelse(bitmap_bh); + ext4_std_error(sb, err); + return err; +} + +/** + * ext4_trim_extent -- function to TRIM one single free extent in the group + * @sb: super block for the file system + * @start: starting block of the free extent in the alloc. group + * @count: number of blocks to TRIM + * @group: alloc. group we are working with + * @e4b: ext4 buddy for the group + * + * Trim "count" blocks starting at "start" in the "group". To assure that no + * one will allocate those blocks, mark it as used in buddy bitmap. This must + * be called with under the group lock. + */ +static int ext4_trim_extent(struct super_block *sb, int start, int count, + ext4_group_t group, struct ext4_buddy *e4b) +__releases(bitlock) +__acquires(bitlock) +{ + struct ext4_free_extent ex; + int ret = 0; + + trace_ext4_trim_extent(sb, group, start, count); + + assert_spin_locked(ext4_group_lock_ptr(sb, group)); + + ex.fe_start = start; + ex.fe_group = group; + ex.fe_len = count; + + /* + * Mark blocks used, so no one can reuse them while + * being trimmed. + */ + mb_mark_used(e4b, &ex); + ext4_unlock_group(sb, group); + ret = ext4_issue_discard(sb, group, start, count); + ext4_lock_group(sb, group); + mb_free_blocks(NULL, e4b, start, ex.fe_len); + return ret; +} + +/** + * ext4_trim_all_free -- function to trim all free space in alloc. group + * @sb: super block for file system + * @group: group to be trimmed + * @start: first group block to examine + * @max: last group block to examine + * @minblocks: minimum extent block count + * + * ext4_trim_all_free walks through group's buddy bitmap searching for free + * extents. When the free block is found, ext4_trim_extent is called to TRIM + * the extent. + * + * + * ext4_trim_all_free walks through group's block bitmap searching for free + * extents. When the free extent is found, mark it as used in group buddy + * bitmap. Then issue a TRIM command on this extent and free the extent in + * the group buddy bitmap. This is done until whole group is scanned. 
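Concretely, the scan walks the in-core bitmap looking for runs of zero bits and only discards a run when it is at least minblocks long; shorter runs still count toward free_count so the early-exit check can fire. A userspace model of the run detection over a byte-per-bit bitmap (the discard itself is elided):

/* count the blocks a trim pass would discard in bitmap[start..max] */
static int toy_trim_group(const unsigned char *bitmap, int start, int max,
			  int minblocks)
{
	int trimmed = 0;

	while (start <= max) {
		while (start <= max && bitmap[start])	/* skip used blocks */
			start++;
		int next = start;
		while (next <= max && !bitmap[next])	/* extend free run */
			next++;
		if (next - start >= minblocks)		/* worth a discard */
			trimmed += next - start;
		start = next + 1;
	}
	return trimmed;
}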
+
+/**
+ * ext4_trim_all_free -- function to trim all free space in alloc. group
+ * @sb:			super block for file system
+ * @group:		group to be trimmed
+ * @start:		first group block to examine
+ * @max:		last group block to examine
+ * @minblocks:		minimum extent block count
+ *
+ * ext4_trim_all_free walks through the group's block bitmap searching for
+ * free extents. When a free extent of at least @minblocks blocks is found,
+ * ext4_trim_extent is called to TRIM it: the extent is marked as used in
+ * the group buddy bitmap, a TRIM command is issued on it, and it is then
+ * freed again in the buddy bitmap. This is repeated until the whole group
+ * has been scanned.
+ */
+static ext4_grpblk_t
+ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
+		   ext4_grpblk_t start, ext4_grpblk_t max,
+		   ext4_grpblk_t minblocks)
+{
+	void *bitmap;
+	ext4_grpblk_t next, count = 0, free_count = 0;
+	struct ext4_buddy e4b;
+	int ret = 0;
+
+	trace_ext4_trim_all_free(sb, group, start, max);
+
+	ret = ext4_mb_load_buddy(sb, group, &e4b);
+	if (ret) {
+		ext4_error(sb, "Error in loading buddy "
+			   "information for %u", group);
+		return ret;
+	}
+	bitmap = e4b.bd_bitmap;
+
+	ext4_lock_group(sb, group);
+	if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
+	    minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
+		goto out;
+
+	start = (e4b.bd_info->bb_first_free > start) ?
+		e4b.bd_info->bb_first_free : start;
+
+	while (start <= max) {
+		start = mb_find_next_zero_bit(bitmap, max + 1, start);
+		if (start > max)
+			break;
+		next = mb_find_next_bit(bitmap, max + 1, start);
+
+		if ((next - start) >= minblocks) {
+			ret = ext4_trim_extent(sb, start,
+					       next - start, group, &e4b);
+			if (ret && ret != -EOPNOTSUPP)
+				break;
+			ret = 0;
+			count += next - start;
+		}
+		free_count += next - start;
+		start = next + 1;
+
+		if (fatal_signal_pending(current)) {
+			count = -ERESTARTSYS;
+			break;
+		}
+
+		if (need_resched()) {
+			ext4_unlock_group(sb, group);
+			cond_resched();
+			ext4_lock_group(sb, group);
+		}
+
+		if ((e4b.bd_info->bb_free - free_count) < minblocks)
+			break;
+	}
+
+	if (!ret) {
+		ret = count;
+		EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
+	}
+out:
+	ext4_unlock_group(sb, group);
+	ext4_mb_unload_buddy(&e4b);
+
+	ext4_debug("trimmed %d blocks in the group %d\n",
+		   count, group);
+
+	return ret;
+}
+
+/**
+ * ext4_trim_fs() -- trim ioctl handler function
+ * @sb:			superblock for filesystem
+ * @range:		fstrim_range structure
+ *
+ * start:	first byte to trim
+ * len:		number of bytes to trim from start
+ * minlen:	minimum extent length in bytes
+ * ext4_trim_fs goes through all allocation groups containing bytes from
+ * start to start+len. For each such group, ext4_trim_all_free is invoked
+ * to trim all free space.
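
For reference, ext4_trim_fs() sits behind the FITRIM ioctl; a userspace caller fills in a struct fstrim_range and gets the number of discarded bytes back in range.len, roughly like this (error handling kept minimal):

#include <fcntl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct fstrim_range range = {
		.start  = 0,
		.len    = UINT64_MAX,	/* whole filesystem */
		.minlen = 0,		/* let the fs round this up */
	};
	int fd = open(argc > 1 ? argv[1] : "/", O_RDONLY);

	if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		return 1;
	}
	/* On return the kernel reports how many bytes were discarded. */
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}
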
+ */ +int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + struct ext4_group_info *grp; + ext4_group_t group, first_group, last_group; + ext4_grpblk_t cnt = 0, first_cluster, last_cluster; + uint64_t start, end, minlen, trimmed = 0; + ext4_fsblk_t first_data_blk = + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); + int ret = 0; + + start = range->start >> sb->s_blocksize_bits; + end = start + (range->len >> sb->s_blocksize_bits) - 1; + minlen = EXT4_NUM_B2C(EXT4_SB(sb), + range->minlen >> sb->s_blocksize_bits); + + if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) || + start >= max_blks || + range->len < sb->s_blocksize) + return -EINVAL; + if (end >= max_blks) + end = max_blks - 1; + if (end <= first_data_blk) + goto out; + if (start < first_data_blk) + start = first_data_blk; + + /* Determine first and last group to examine based on start and end */ + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, + &first_group, &first_cluster); + ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, + &last_group, &last_cluster); + + /* end now represents the last cluster to discard in this group */ + end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; + + for (group = first_group; group <= last_group; group++) { + grp = ext4_get_group_info(sb, group); + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + ret = ext4_mb_init_group(sb, group); + if (ret) + break; + } + + /* + * For all the groups except the last one, last cluster will + * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to + * change it for the last group, note that last_cluster is + * already computed earlier by ext4_get_group_no_and_offset() + */ + if (group == last_group) + end = last_cluster; + + if (grp->bb_free >= minlen) { + cnt = ext4_trim_all_free(sb, group, first_cluster, + end, minlen); + if (cnt < 0) { + ret = cnt; + break; + } + trimmed += cnt; + } + + /* + * For every group except the first one, we are sure + * that the first cluster to discard will be cluster #0. + */ + first_cluster = 0; + } + + if (!ret) + atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); + +out: + range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; + return ret; +} diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h new file mode 100644 index 00000000000..d634e183b4d --- /dev/null +++ b/fs/ext4/mballoc.h @@ -0,0 +1,215 @@ +/* + * fs/ext4/mballoc.h + * + * Written by: Alex Tomas <alex@clusterfs.com> + * + */ +#ifndef _EXT4_MBALLOC_H +#define _EXT4_MBALLOC_H + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/proc_fs.h> +#include <linux/pagemap.h> +#include <linux/seq_file.h> +#include <linux/blkdev.h> +#include <linux/mutex.h> +#include "ext4_jbd2.h" +#include "ext4.h" + +/* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with DOUBLE_CHECK defined mballoc creates persistent in-core + * bitmaps, maintains and uses them to check for double allocations + */ +#define DOUBLE_CHECK__ + +/* + */ +#ifdef CONFIG_EXT4_DEBUG +extern ushort ext4_mballoc_debug; + +#define mb_debug(n, fmt, a...) 
\
+	do {							\
+		if ((n) <= ext4_mballoc_debug) {		\
+			printk(KERN_DEBUG "(%s, %d): %s: ",	\
+			       __FILE__, __LINE__, __func__);	\
+			printk(fmt, ## a);			\
+		}						\
+	} while (0)
+#else
+#define mb_debug(n, fmt, a...)		no_printk(fmt, ## a)
+#endif
+
+#define EXT4_MB_HISTORY_ALLOC		1	/* allocation */
+#define EXT4_MB_HISTORY_PREALLOC	2	/* preallocated blocks used */
+
+/*
+ * How long mballoc can look for a best extent (in found extents)
+ */
+#define MB_DEFAULT_MAX_TO_SCAN		200
+
+/*
+ * How long mballoc must look for a best extent
+ */
+#define MB_DEFAULT_MIN_TO_SCAN		10
+
+/*
+ * with 'ext4_mb_stats' the allocator collects stats that are shown at
+ * umount. Collecting them has a cost, though!
+ */
+#define MB_DEFAULT_STATS		0
+
+/*
+ * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
+ * by the stream allocator, whose purpose is to pack requests
+ * as close to each other as possible to produce smooth I/O traffic.
+ * We use locality group prealloc space for stream requests.
+ * We can tune this via /proc/fs/ext4/<partition>/stream_req
+ */
+#define MB_DEFAULT_STREAM_THRESHOLD	16 /* 64K */
+
+/*
+ * for which requests use 2^N search using buddies
+ */
+#define MB_DEFAULT_ORDER2_REQS		2
+
+/*
+ * default group prealloc size 512 blocks
+ */
+#define MB_DEFAULT_GROUP_PREALLOC	512
+
+
+struct ext4_free_data {
+	/* MUST be the first member */
+	struct ext4_journal_cb_entry	efd_jce;
+
+	/* ext4_free_data private data starts from here */
+
+	/* this links the free block information from group_info */
+	struct rb_node			efd_node;
+
+	/* group to which the free block extent belongs */
+	ext4_group_t			efd_group;
+
+	/* free block extent */
+	ext4_grpblk_t			efd_start_cluster;
+	ext4_grpblk_t			efd_count;
+
+	/* transaction which freed this extent */
+	tid_t				efd_tid;
+};
+
+struct ext4_prealloc_space {
+	struct list_head	pa_inode_list;
+	struct list_head	pa_group_list;
+	union {
+		struct list_head pa_tmp_list;
+		struct rcu_head	pa_rcu;
+	} u;
+	spinlock_t		pa_lock;
+	atomic_t		pa_count;
+	unsigned		pa_deleted;
+	ext4_fsblk_t		pa_pstart;	/* phys. block */
+	ext4_lblk_t		pa_lstart;	/* log. block */
+	ext4_grpblk_t		pa_len;		/* len of preallocated chunk */
+	ext4_grpblk_t		pa_free;	/* how many blocks are free */
+	unsigned short		pa_type;	/* pa type. inode or group */
+	spinlock_t		*pa_obj_lock;
+	struct inode		*pa_inode;	/* hack, for history only */
+};
+
+enum {
+	MB_INODE_PA = 0,
+	MB_GROUP_PA = 1
+};
+
+struct ext4_free_extent {
+	ext4_lblk_t fe_logical;
+	ext4_grpblk_t fe_start;	/* In cluster units */
+	ext4_group_t fe_group;
+	ext4_grpblk_t fe_len;	/* In cluster units */
+};
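
The locality groups described in the comment below hash preallocations by the order of their free block count, fls(pa_free) - 1. A small standalone sketch of that bucket selection; fls_compat() is a userspace stand-in for the kernel's fls(), and the clamping is illustrative:

#include <stdio.h>

#define PREALLOC_TB_SIZE 10

/* Index of the highest set bit, counting from 1; 0 for x == 0. */
static int fls_compat(unsigned int x)
{
	int i = 0;

	while (x) {
		x >>= 1;
		i++;
	}
	return i;
}

/* Pick the lg_prealloc_list bucket for a PA with pa_free free blocks. */
static int lg_list_index(unsigned int pa_free)
{
	int order = fls_compat(pa_free) - 1;

	if (order < 0)
		order = 0;
	if (order >= PREALLOC_TB_SIZE)
		order = PREALLOC_TB_SIZE - 1;
	return order;
}

int main(void)
{
	unsigned int sizes[] = { 1, 7, 8, 512, 100000 };

	for (int i = 0; i < (int)(sizeof(sizes) / sizeof(sizes[0])); i++)
		printf("pa_free=%-6u -> list %d\n", sizes[i],
		       lg_list_index(sizes[i]));
	return 0;
}
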
+
+/*
+ * Locality group:
+ *   we try to group all related changes together
+ *   so that writeback can flush/allocate them together as well
+ * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
+ * (512). We store prealloc space into the hash based on the pa_free block
+ * order value, i.e., fls(pa_free) - 1.
+ */
+#define PREALLOC_TB_SIZE 10
+struct ext4_locality_group {
+	/* for allocator */
+	/* to serialize allocates */
+	struct mutex		lg_mutex;
+	/* list of preallocations */
+	struct list_head	lg_prealloc_list[PREALLOC_TB_SIZE];
+	spinlock_t		lg_prealloc_lock;
+};
+
+struct ext4_allocation_context {
+	struct inode *ac_inode;
+	struct super_block *ac_sb;
+
+	/* original request */
+	struct ext4_free_extent ac_o_ex;
+
+	/* goal request (normalized ac_o_ex) */
+	struct ext4_free_extent ac_g_ex;
+
+	/* the best found extent */
+	struct ext4_free_extent ac_b_ex;
+
+	/* copy of the best found extent taken before preallocation efforts */
+	struct ext4_free_extent ac_f_ex;
+
+	__u16 ac_groups_scanned;
+	__u16 ac_found;
+	__u16 ac_tail;
+	__u16 ac_buddy;
+	__u16 ac_flags;		/* allocation hints */
+	__u8 ac_status;
+	__u8 ac_criteria;
+	__u8 ac_2order;		/* if request is to allocate 2^N blocks and
+				 * N > 0, the field stores N, otherwise 0 */
+	__u8 ac_op;		/* operation, for history only */
+	struct page *ac_bitmap_page;
+	struct page *ac_buddy_page;
+	struct ext4_prealloc_space *ac_pa;
+	struct ext4_locality_group *ac_lg;
+};
+
+#define AC_STATUS_CONTINUE	1
+#define AC_STATUS_FOUND		2
+#define AC_STATUS_BREAK		3
+
+struct ext4_buddy {
+	struct page *bd_buddy_page;
+	void *bd_buddy;
+	struct page *bd_bitmap_page;
+	void *bd_bitmap;
+	struct ext4_group_info *bd_info;
+	struct super_block *bd_sb;
+	__u16 bd_blkbits;
+	ext4_group_t bd_group;
+};
+
+static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
+					struct ext4_free_extent *fex)
+{
+	return ext4_group_first_block_no(sb, fex->fe_group) +
+		(fex->fe_start << EXT4_SB(sb)->s_cluster_bits);
+}
+#endif
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 5c1e27de775..ec092437d3e 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -12,21 +12,21 @@
  *
  */
 
-#include <linux/module.h>
-#include <linux/ext4_jbd2.h>
-#include <linux/ext4_fs_extents.h>
+#include <linux/slab.h>
+#include "ext4_jbd2.h"
+#include "ext4_extents.h"
 
 /*
  * The contiguous blocks details which can be
  * represented by a single extent
  */
-struct list_blocks_struct {
-	ext4_lblk_t first_block, last_block;
+struct migrate_struct {
+	ext4_lblk_t first_block, last_block, curr_block;
 	ext4_fsblk_t first_pblock, last_pblock;
 };
 
 static int finish_range(handle_t *handle, struct inode *inode,
-				struct list_blocks_struct *lb)
+				struct migrate_struct *lb)
 {
 	int retval = 0, needed;
 
@@ -39,7 +39,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
 	newext.ee_block = cpu_to_le32(lb->first_block);
 	newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1);
 	ext4_ext_store_pblock(&newext, lb->first_pblock);
-	path = ext4_ext_find_extent(inode, lb->first_block, NULL);
+	path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0);
 
 	if (IS_ERR(path)) {
 		retval = PTR_ERR(path);
@@ -53,12 +53,14 @@ static int finish_range(handle_t *handle, struct inode *inode,
 	 * credit. But below we try to not accumulate too many
 	 * of them by restarting the journal.
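
The credit handling mentioned here follows a common extend-or-restart pattern: try to stretch the running handle, and fall back to restarting the transaction when the journal cannot extend it. A toy model of that logic (not the real jbd2 API; the handle struct and both journal_* helpers are invented for illustration):

#include <stdio.h>

/* Toy journal handle with a credit budget. */
struct handle {
	int credits_left;
};

static int journal_extend(struct handle *h, int want)
{
	/* Pretend the running transaction can only spare 2 more credits. */
	if (want <= 2) {
		h->credits_left += want;
		return 0;	/* extended in place */
	}
	return 1;		/* caller must restart instead */
}

static int journal_restart(struct handle *h, int want)
{
	/* A restart commits what we have and opens a fresh transaction. */
	h->credits_left = want;
	return 0;
}

/* The extend-or-restart pattern used around extent insertion. */
static int ensure_credits(struct handle *h, int needed)
{
	if (h->credits_left >= needed)
		return 0;
	if (journal_extend(h, needed - h->credits_left) == 0)
		return 0;
	return journal_restart(h, needed);
}

int main(void)
{
	struct handle h = { .credits_left = 1 };

	ensure_credits(&h, 8);
	printf("credits now: %d\n", h.credits_left);
	return 0;
}
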
*/ - needed = ext4_ext_calc_credits_for_insert(inode, path); + needed = ext4_ext_calc_credits_for_single_extent(inode, + lb->last_block - lb->first_block + 1, path); /* * Make sure the credit we accumalated is not really high */ - if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) { + if (needed && ext4_handle_has_enough_credits(handle, + EXT4_RESERVE_TRANS_BLOCKS)) { retval = ext4_journal_restart(handle, needed); if (retval) goto err_out; @@ -73,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode, goto err_out; } } - retval = ext4_ext_insert_extent(handle, inode, path, &newext); + retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); err_out: if (path) { ext4_ext_drop_refs(path); @@ -84,8 +86,7 @@ err_out: } static int update_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, ext4_lblk_t blk_num, - struct list_blocks_struct *lb) + ext4_fsblk_t pblock, struct migrate_struct *lb) { int retval; /* @@ -93,9 +94,10 @@ static int update_extent_range(handle_t *handle, struct inode *inode, */ if (lb->first_pblock && (lb->last_pblock+1 == pblock) && - (lb->last_block+1 == blk_num)) { + (lb->last_block+1 == lb->curr_block)) { lb->last_pblock = pblock; - lb->last_block = blk_num; + lb->last_block = lb->curr_block; + lb->curr_block++; return 0; } /* @@ -103,64 +105,49 @@ static int update_extent_range(handle_t *handle, struct inode *inode, */ retval = finish_range(handle, inode, lb); lb->first_pblock = lb->last_pblock = pblock; - lb->first_block = lb->last_block = blk_num; - + lb->first_block = lb->last_block = lb->curr_block; + lb->curr_block++; return retval; } static int update_ind_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, - struct list_blocks_struct *lb) + ext4_fsblk_t pblock, + struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; - ext4_lblk_t blk_count = *blk_nump; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - if (!pblock) { - /* Only update the file block number */ - *blk_nump += max_entries; - return 0; - } - bh = sb_bread(inode->i_sb, pblock); if (!bh) return -EIO; i_data = (__le32 *)bh->b_data; - for (i = 0; i < max_entries; i++, blk_count++) { + for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_extent_range(handle, inode, - le32_to_cpu(i_data[i]), - blk_count, lb); + le32_to_cpu(i_data[i]), lb); if (retval) break; + } else { + lb->curr_block++; } } - - /* Update the file block number */ - *blk_nump = blk_count; put_bh(bh); return retval; } static int update_dind_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, - struct list_blocks_struct *lb) + ext4_fsblk_t pblock, + struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; - ext4_lblk_t blk_count = *blk_nump; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - if (!pblock) { - /* Only update the file block number */ - *blk_nump += max_entries * max_entries; - return 0; - } bh = sb_bread(inode->i_sb, pblock); if (!bh) return -EIO; @@ -169,38 +156,28 @@ static int update_dind_extent_range(handle_t *handle, struct inode *inode, for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_ind_extent_range(handle, inode, - le32_to_cpu(i_data[i]), - &blk_count, lb); + le32_to_cpu(i_data[i]), lb); if (retval) break; } else { /* Only update the file block number */ - blk_count += max_entries; + lb->curr_block += max_entries; } } - - /* Update the file 
block number */ - *blk_nump = blk_count; put_bh(bh); return retval; } static int update_tind_extent_range(handle_t *handle, struct inode *inode, - ext4_fsblk_t pblock, ext4_lblk_t *blk_nump, - struct list_blocks_struct *lb) + ext4_fsblk_t pblock, + struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; - ext4_lblk_t blk_count = *blk_nump; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - if (!pblock) { - /* Only update the file block number */ - *blk_nump += max_entries * max_entries * max_entries; - return 0; - } bh = sb_bread(inode->i_sb, pblock); if (!bh) return -EIO; @@ -209,16 +186,14 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode, for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_dind_extent_range(handle, inode, - le32_to_cpu(i_data[i]), - &blk_count, lb); + le32_to_cpu(i_data[i]), lb); if (retval) break; - } else + } else { /* Only update the file block number */ - blk_count += max_entries * max_entries; + lb->curr_block += max_entries * max_entries; + } } - /* Update the file block number */ - *blk_nump = blk_count; put_bh(bh); return retval; @@ -228,7 +203,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) { int retval = 0, needed; - if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) return 0; /* * We are freeing a blocks. During this we touch @@ -236,7 +211,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) * So allocate a credit of 3. We may update * quota (user and group). */ - needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); + needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); if (ext4_journal_extend(handle, needed) != 0) retval = ext4_journal_restart(handle, needed); @@ -260,13 +235,17 @@ static int free_dind_blocks(handle_t *handle, for (i = 0; i < max_entries; i++) { if (tmp_idata[i]) { extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, - le32_to_cpu(tmp_idata[i]), 1, 1); + ext4_free_blocks(handle, inode, NULL, + le32_to_cpu(tmp_idata[i]), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); } } put_bh(bh); extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); return 0; } @@ -295,7 +274,9 @@ static int free_tind_blocks(handle_t *handle, } put_bh(bh); extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); return 0; } @@ -306,8 +287,10 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) /* ei->i_data[EXT4_IND_BLOCK] */ if (i_data[0]) { extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, - le32_to_cpu(i_data[0]), 1, 1); + ext4_free_blocks(handle, inode, NULL, + le32_to_cpu(i_data[0]), 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); } /* ei->i_data[EXT4_DIND_BLOCK] */ @@ -327,7 +310,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) } static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, - struct inode *tmp_inode) + struct inode *tmp_inode) { int retval; __le32 i_data[3]; @@ -339,7 +322,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct 
inode *inode,
 				 * i_data field of the original inode
 				 */
 	retval = ext4_journal_extend(handle, 1);
-	if (retval != 0) {
+	if (retval) {
 		retval = ext4_journal_restart(handle, 1);
 		if (retval)
 			goto err_out;
@@ -351,10 +334,21 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
 	down_write(&EXT4_I(inode)->i_data_sem);
 	/*
+	 * If EXT4_STATE_EXT_MIGRATE is cleared, a block allocation
+	 * happened after we started the migrate. We need to
+	 * fail the migrate.
+	 */
+	if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) {
+		retval = -EAGAIN;
+		up_write(&EXT4_I(inode)->i_data_sem);
+		goto err_out;
+	} else
+		ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+	/*
 	 * We have the extent map built with the tmp inode.
 	 * Now copy the i_data across
 	 */
-	ei->i_flags |= EXT4_EXTENTS_FL;
+	ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
 	memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
 
 	/*
@@ -390,7 +384,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
 	struct buffer_head *bh;
 	struct ext4_extent_header *eh;
 
-	block = idx_pblock(ix);
+	block = ext4_idx_pblock(ix);
 	bh = sb_bread(inode->i_sb, block);
 	if (!bh)
 		return -EIO;
@@ -406,7 +400,8 @@
 	}
 	put_bh(bh);
 	extend_credit_for_blkdel(handle, inode);
-	ext4_free_blocks(handle, inode, block, 1, 1);
+	ext4_free_blocks(handle, inode, NULL, block, 1,
+			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
 
 	return retval;
 }
@@ -431,28 +426,27 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
 			return retval;
 	}
 	return retval;
-
 }
 
-int ext4_ext_migrate(struct inode *inode, struct file *filp,
-			unsigned int cmd, unsigned long arg)
+int ext4_ext_migrate(struct inode *inode)
 {
 	handle_t *handle;
 	int retval = 0, i;
 	__le32 *i_data;
-	ext4_lblk_t blk_count = 0;
 	struct ext4_inode_info *ei;
 	struct inode *tmp_inode = NULL;
-	struct list_blocks_struct lb;
+	struct migrate_struct lb;
 	unsigned long max_entries;
+	__u32 goal;
+	uid_t owner[2];
 
-	if (!test_opt(inode->i_sb, EXTENTS))
-		/*
-		 * if mounted with noextents we don't allow the migrate
-		 */
-		return -EINVAL;
-
-	if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+	/*
+	 * If the filesystem does not support extents, or the inode
+	 * already is extent-based, error out.
+	 */
+	if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+				       EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+	    (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EINVAL;
 
 	if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
@@ -461,34 +455,35 @@
 		 */
 		return retval;
 
-	handle = ext4_journal_start(inode,
-					EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
-					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-					2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)
-					+ 1);
+	/*
+	 * Worst case we can touch the allocation bitmaps, a bgd
+	 * block, and a block to link in the orphan list. We do need
+	 * to worry about credits for modifying the quota inode.
+ */ + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, + 4 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); + if (IS_ERR(handle)) { retval = PTR_ERR(handle); - goto err_out; + return retval; } - tmp_inode = ext4_new_inode(handle, - inode->i_sb->s_root->d_inode, - S_IFREG); + goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * + EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; + owner[0] = i_uid_read(inode); + owner[1] = i_gid_read(inode); + tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, + S_IFREG, NULL, goal, owner); if (IS_ERR(tmp_inode)) { - retval = -ENOMEM; + retval = PTR_ERR(tmp_inode); ext4_journal_stop(handle); - tmp_inode = NULL; - goto err_out; + return retval; } i_size_write(tmp_inode, i_size_read(inode)); /* - * We don't want the inode to be reclaimed - * if we got interrupted in between. We have - * this tmp inode carrying reference to the - * data blocks of the original file. We set - * the i_nlink to zero at the last stage after - * switching the original file to extent format + * Set the i_nlink to zero so it will be deleted later + * when we drop inode reference. */ - tmp_inode->i_nlink = 1; + clear_nlink(tmp_inode); ext4_ext_tree_init(handle, tmp_inode); ext4_orphan_add(handle, tmp_inode); @@ -498,17 +493,33 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp, * start with one credit accounted for * superblock modification. * - * For the tmp_inode we already have commited the - * trascation that created the inode. Later as and + * For the tmp_inode we already have committed the + * transaction that created the inode. Later as and * when we add extents we extent the journal */ /* - * inode_mutex prevent write and truncate on the file. Read still goes - * through. We take i_data_sem in ext4_ext_swap_inode_data before we - * switch the inode format to prevent read. + * Even though we take i_mutex we can still cause block + * allocation via mmap write to holes. If we have allocated + * new blocks we fail migrate. New block allocation will + * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated + * with i_data_sem held to prevent racing with block + * allocation. 
+	 */
-	mutex_lock(&(inode->i_mutex));
-	handle = ext4_journal_start(inode, 1);
+	down_read(&EXT4_I(inode)->i_data_sem);
+	ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+	up_read((&EXT4_I(inode)->i_data_sem));
+
+	handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
+	if (IS_ERR(handle)) {
+		/*
+		 * It is impossible to update on-disk structures without
+		 * a handle, so just roll back the in-core changes and
+		 * leave the rest of the work to orphan_list_cleanup().
+		 */
+		ext4_orphan_del(NULL, tmp_inode);
+		retval = PTR_ERR(handle);
+		goto out;
+	}
 
 	ei = EXT4_I(inode);
 	i_data = ei->i_data;
 
@@ -516,35 +527,32 @@
 	/* 32 bit block address 4 bytes */
 	max_entries = inode->i_sb->s_blocksize >> 2;
 
-	for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) {
+	for (i = 0; i < EXT4_NDIR_BLOCKS; i++) {
 		if (i_data[i]) {
 			retval = update_extent_range(handle, tmp_inode,
-						le32_to_cpu(i_data[i]),
-						blk_count, &lb);
+						le32_to_cpu(i_data[i]), &lb);
 			if (retval)
 				goto err_out;
-		}
+		} else
+			lb.curr_block++;
 	}
 	if (i_data[EXT4_IND_BLOCK]) {
 		retval = update_ind_extent_range(handle, tmp_inode,
-				le32_to_cpu(i_data[EXT4_IND_BLOCK]),
-				&blk_count, &lb);
+				le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb);
 		if (retval)
 			goto err_out;
 	} else
-		blk_count += max_entries;
+		lb.curr_block += max_entries;
 	if (i_data[EXT4_DIND_BLOCK]) {
 		retval = update_dind_extent_range(handle, tmp_inode,
-				le32_to_cpu(i_data[EXT4_DIND_BLOCK]),
-				&blk_count, &lb);
+				le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb);
 		if (retval)
 			goto err_out;
 	} else
-		blk_count += max_entries * max_entries;
+		lb.curr_block += max_entries * max_entries;
 	if (i_data[EXT4_TIND_BLOCK]) {
 		retval = update_tind_extent_range(handle, tmp_inode,
-				le32_to_cpu(i_data[EXT4_TIND_BLOCK]),
-				&blk_count, &lb);
+				le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb);
 		if (retval)
 			goto err_out;
 	}
@@ -559,9 +567,15 @@ err_out:
 		 * tmp_inode
 		 */
 		free_ext_block(handle, tmp_inode);
-	else
-		retval = ext4_ext_swap_inode_data(handle, inode,
-							tmp_inode);
+	else {
+		retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode);
+		if (retval)
+			/*
+			 * if we fail to swap inode data, free the extent
+			 * details of the tmp inode
+			 */
+			free_ext_block(handle, tmp_inode);
+	}
 
 	/* We mark the tmp_inode dirty via ext4_ext_tree_init.
*/ if (ext4_journal_extend(handle, 1) != 0) @@ -584,19 +598,71 @@ err_out: /* Reset the extent details */ ext4_ext_tree_init(handle, tmp_inode); + ext4_journal_stop(handle); +out: + unlock_new_inode(tmp_inode); + iput(tmp_inode); - /* - * Set the i_nlink to zero so that - * generic_drop_inode really deletes the - * inode - */ - tmp_inode->i_nlink = 0; + return retval; +} - ext4_journal_stop(handle); - mutex_unlock(&(inode->i_mutex)); +/* + * Migrate a simple extent-based inode to use the i_blocks[] array + */ +int ext4_ind_migrate(struct inode *inode) +{ + struct ext4_extent_header *eh; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_extent *ex; + unsigned int i, len; + ext4_fsblk_t blk; + handle_t *handle; + int ret; + + if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS) || + (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EINVAL; - if (tmp_inode) - iput(tmp_inode); + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC)) + return -EOPNOTSUPP; - return retval; + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + down_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_ext_check_inode(inode); + if (ret) + goto errout; + + eh = ext_inode_hdr(inode); + ex = EXT_FIRST_EXTENT(eh); + if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS || + eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) { + ret = -EOPNOTSUPP; + goto errout; + } + if (eh->eh_entries == 0) + blk = len = 0; + else { + len = le16_to_cpu(ex->ee_len); + blk = ext4_ext_pblock(ex); + if (len > EXT4_NDIR_BLOCKS) { + ret = -EOPNOTSUPP; + goto errout; + } + } + + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); + memset(ei->i_data, 0, sizeof(ei->i_data)); + for (i=0; i < len; i++) + ei->i_data[i] = cpu_to_le32(blk++); + ext4_mark_inode_dirty(handle, inode); +errout: + ext4_journal_stop(handle); + up_write(&EXT4_I(inode)->i_data_sem); + return ret; } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c new file mode 100644 index 00000000000..32bce844c2e --- /dev/null +++ b/fs/ext4/mmp.c @@ -0,0 +1,395 @@ +#include <linux/fs.h> +#include <linux/random.h> +#include <linux/buffer_head.h> +#include <linux/utsname.h> +#include <linux/kthread.h> + +#include "ext4.h" + +/* Checksumming functions */ +static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int offset = offsetof(struct mmp_struct, mmp_checksum); + __u32 csum; + + csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset); + + return cpu_to_le32(csum); +} + +static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); +} + +static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + mmp->mmp_checksum = ext4_mmp_csum(sb, mmp); +} + +/* + * Write the MMP block using WRITE_SYNC to try to get the block on-disk + * faster. + */ +static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) +{ + struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data); + + /* + * We protect against freezing so that we don't create dirty buffers + * on frozen filesystem. 
+ */ + sb_start_write(sb); + ext4_mmp_csum_set(sb, mmp); + mark_buffer_dirty(bh); + lock_buffer(bh); + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); + wait_on_buffer(bh); + sb_end_write(sb); + if (unlikely(!buffer_uptodate(bh))) + return 1; + + return 0; +} + +/* + * Read the MMP block. It _must_ be read from disk and hence we clear the + * uptodate flag on the buffer. + */ +static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, + ext4_fsblk_t mmp_block) +{ + struct mmp_struct *mmp; + + if (*bh) + clear_buffer_uptodate(*bh); + + /* This would be sb_bread(sb, mmp_block), except we need to be sure + * that the MD RAID device cache has been bypassed, and that the read + * is not blocked in the elevator. */ + if (!*bh) + *bh = sb_getblk(sb, mmp_block); + if (!*bh) + return -ENOMEM; + if (*bh) { + get_bh(*bh); + lock_buffer(*bh); + (*bh)->b_end_io = end_buffer_read_sync; + submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); + wait_on_buffer(*bh); + if (!buffer_uptodate(*bh)) { + brelse(*bh); + *bh = NULL; + } + } + if (unlikely(!*bh)) { + ext4_warning(sb, "Error while reading MMP block %llu", + mmp_block); + return -EIO; + } + + mmp = (struct mmp_struct *)((*bh)->b_data); + if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC || + !ext4_mmp_csum_verify(sb, mmp)) + return -EINVAL; + + return 0; +} + +/* + * Dump as much information as possible to help the admin. + */ +void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, + const char *function, unsigned int line, const char *msg) +{ + __ext4_warning(sb, function, line, msg); + __ext4_warning(sb, function, line, + "MMP failure info: last update time: %llu, last update " + "node: %s, last update device: %s\n", + (long long unsigned int) le64_to_cpu(mmp->mmp_time), + mmp->mmp_nodename, mmp->mmp_bdevname); +} + +/* + * kmmpd will update the MMP sequence every s_mmp_update_interval seconds + */ +static int kmmpd(void *data) +{ + struct super_block *sb = ((struct mmpd_data *) data)->sb; + struct buffer_head *bh = ((struct mmpd_data *) data)->bh; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct mmp_struct *mmp; + ext4_fsblk_t mmp_block; + u32 seq = 0; + unsigned long failed_writes = 0; + int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval); + unsigned mmp_check_interval; + unsigned long last_update_time; + unsigned long diff; + int retval; + + mmp_block = le64_to_cpu(es->s_mmp_block); + mmp = (struct mmp_struct *)(bh->b_data); + mmp->mmp_time = cpu_to_le64(get_seconds()); + /* + * Start with the higher mmp_check_interval and reduce it if + * the MMP block is being updated on time. + */ + mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, + EXT4_MMP_MIN_CHECK_INTERVAL); + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + bdevname(bh->b_bdev, mmp->mmp_bdevname); + + memcpy(mmp->mmp_nodename, init_utsname()->nodename, + sizeof(mmp->mmp_nodename)); + + while (!kthread_should_stop()) { + if (++seq > EXT4_MMP_SEQ_MAX) + seq = 1; + + mmp->mmp_seq = cpu_to_le32(seq); + mmp->mmp_time = cpu_to_le64(get_seconds()); + last_update_time = jiffies; + + retval = write_mmp_block(sb, bh); + /* + * Don't spew too many error messages. Print one every + * (s_mmp_update_interval * 60) seconds. 
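
Stripped of the buffer-head and kthread machinery, each kmmpd iteration just bumps a sequence number, timestamps the block, and writes it out. A compressed userspace sketch of that heartbeat (the sequence cap here is a placeholder, not the real EXT4_MMP_SEQ_MAX value):

#include <stdio.h>
#include <time.h>
#include <unistd.h>

#define MMP_SEQ_MAX 0x00FFFFFFU		/* placeholder cap */

struct mmp {				/* toy stand-in for struct mmp_struct */
	unsigned int seq;
	long long time;
};

/* One heartbeat: bump the sequence and timestamp, then "write" it out. */
static int mmp_heartbeat(struct mmp *mmp, unsigned int *seq)
{
	if (++*seq > MMP_SEQ_MAX)
		*seq = 1;
	mmp->seq = *seq;
	mmp->time = (long long)time(NULL);
	printf("heartbeat: seq=%u time=%lld\n", mmp->seq, mmp->time);
	return 0;	/* a real writer could fail and be rate-limited */
}

int main(void)
{
	struct mmp mmp = { 0, 0 };
	unsigned int seq = 0;
	int update_interval = 1;	/* seconds, like s_mmp_update_interval */

	for (int i = 0; i < 3; i++) {
		mmp_heartbeat(&mmp, &seq);
		sleep(update_interval);
	}
	return 0;
}
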
+ */ + if (retval) { + if ((failed_writes % 60) == 0) + ext4_error(sb, "Error writing to MMP block"); + failed_writes++; + } + + if (!(le32_to_cpu(es->s_feature_incompat) & + EXT4_FEATURE_INCOMPAT_MMP)) { + ext4_warning(sb, "kmmpd being stopped since MMP feature" + " has been disabled."); + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + if (sb->s_flags & MS_RDONLY) { + ext4_warning(sb, "kmmpd being stopped since filesystem " + "has been remounted as readonly."); + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + diff = jiffies - last_update_time; + if (diff < mmp_update_interval * HZ) + schedule_timeout_interruptible(mmp_update_interval * + HZ - diff); + + /* + * We need to make sure that more than mmp_check_interval + * seconds have not passed since writing. If that has happened + * we need to check if the MMP block is as we left it. + */ + diff = jiffies - last_update_time; + if (diff > mmp_check_interval * HZ) { + struct buffer_head *bh_check = NULL; + struct mmp_struct *mmp_check; + + retval = read_mmp_block(sb, &bh_check, mmp_block); + if (retval) { + ext4_error(sb, "error reading MMP data: %d", + retval); + + EXT4_SB(sb)->s_mmp_tsk = NULL; + goto failed; + } + + mmp_check = (struct mmp_struct *)(bh_check->b_data); + if (mmp->mmp_seq != mmp_check->mmp_seq || + memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename, + sizeof(mmp->mmp_nodename))) { + dump_mmp_msg(sb, mmp_check, + "Error while updating MMP info. " + "The filesystem seems to have been" + " multiply mounted."); + ext4_error(sb, "abort"); + goto failed; + } + put_bh(bh_check); + } + + /* + * Adjust the mmp_check_interval depending on how much time + * it took for the MMP block to be written. + */ + mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ, + EXT4_MMP_MAX_CHECK_INTERVAL), + EXT4_MMP_MIN_CHECK_INTERVAL); + mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + } + + /* + * Unmount seems to be clean. + */ + mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN); + mmp->mmp_time = cpu_to_le64(get_seconds()); + + retval = write_mmp_block(sb, bh); + +failed: + kfree(data); + brelse(bh); + return retval; +} + +/* + * Get a random new sequence number but make sure it is not greater than + * EXT4_MMP_SEQ_MAX. + */ +static unsigned int mmp_new_seq(void) +{ + u32 new_seq; + + do { + new_seq = prandom_u32(); + } while (new_seq > EXT4_MMP_SEQ_MAX); + + return new_seq; +} + +/* + * Protect the filesystem from being mounted more than once. + */ +int ext4_multi_mount_protect(struct super_block *sb, + ext4_fsblk_t mmp_block) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct buffer_head *bh = NULL; + struct mmp_struct *mmp = NULL; + struct mmpd_data *mmpd_data; + u32 seq; + unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); + unsigned int wait_time = 0; + int retval; + + if (mmp_block < le32_to_cpu(es->s_first_data_block) || + mmp_block >= ext4_blocks_count(es)) { + ext4_warning(sb, "Invalid MMP block in superblock"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + + mmp = (struct mmp_struct *)(bh->b_data); + + if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL) + mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL; + + /* + * If check_interval in MMP block is larger, use that instead of + * update_interval from the superblock. 
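
The mount-time side of MMP, shown below in ext4_multi_mount_protect(), boils down to a two-phase probe: sample the sequence number, wait longer than the other node's update interval, and sample again. A hypothetical sketch of just that check:

#include <stdio.h>
#include <unistd.h>

/* Toy device holding one MMP sequence number. */
static unsigned int device_seq = 42;

static unsigned int read_mmp_seq(void)
{
	return device_seq;	/* a real reader must bypass all caches */
}

/*
 * Two-phase liveness probe: sample the sequence, wait longer than the
 * other node's update interval, and sample again. A change means some
 * other node is still updating the block, so the mount must fail.
 */
static int mmp_device_is_idle(int wait_time)
{
	unsigned int seq = read_mmp_seq();

	sleep(wait_time);
	return read_mmp_seq() == seq;
}

int main(void)
{
	if (mmp_device_is_idle(1))
		printf("no other node detected, safe to mount\n");
	else
		printf("device is active on another node\n");
	return 0;
}
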
+ */ + if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval) + mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval); + + seq = le32_to_cpu(mmp->mmp_seq); + if (seq == EXT4_MMP_SEQ_CLEAN) + goto skip; + + if (seq == EXT4_MMP_SEQ_FSCK) { + dump_mmp_msg(sb, mmp, "fsck is running on the filesystem"); + goto failed; + } + + wait_time = min(mmp_check_interval * 2 + 1, + mmp_check_interval + 60); + + /* Print MMP interval if more than 20 secs. */ + if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4) + ext4_warning(sb, "MMP interval %u higher than expected, please" + " wait.\n", wait_time * 2); + + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { + ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + mmp = (struct mmp_struct *)(bh->b_data); + if (seq != le32_to_cpu(mmp->mmp_seq)) { + dump_mmp_msg(sb, mmp, + "Device is already active on another node."); + goto failed; + } + +skip: + /* + * write a new random sequence number. + */ + seq = mmp_new_seq(); + mmp->mmp_seq = cpu_to_le32(seq); + + retval = write_mmp_block(sb, bh); + if (retval) + goto failed; + + /* + * wait for MMP interval and check mmp_seq. + */ + if (schedule_timeout_interruptible(HZ * wait_time) != 0) { + ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + goto failed; + } + + retval = read_mmp_block(sb, &bh, mmp_block); + if (retval) + goto failed; + mmp = (struct mmp_struct *)(bh->b_data); + if (seq != le32_to_cpu(mmp->mmp_seq)) { + dump_mmp_msg(sb, mmp, + "Device is already active on another node."); + goto failed; + } + + mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); + if (!mmpd_data) { + ext4_warning(sb, "not enough memory for mmpd_data"); + goto failed; + } + mmpd_data->sb = sb; + mmpd_data->bh = bh; + + /* + * Start a kernel thread to update the MMP block periodically. + */ + EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s", + bdevname(bh->b_bdev, + mmp->mmp_bdevname)); + if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { + EXT4_SB(sb)->s_mmp_tsk = NULL; + kfree(mmpd_data); + ext4_warning(sb, "Unable to create kmmpd thread for %s.", + sb->s_id); + goto failed; + } + + return 0; + +failed: + brelse(bh); + return 1; +} + + diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c new file mode 100644 index 00000000000..2484c7ec6a7 --- /dev/null +++ b/fs/ext4/move_extent.c @@ -0,0 +1,1505 @@ +/* + * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd. + * Written by Takashi Sato <t-sato@yk.jp.nec.com> + * Akira Fujita <a-fujita@rs.jp.nec.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/fs.h> +#include <linux/quotaops.h> +#include <linux/slab.h> +#include "ext4_jbd2.h" +#include "ext4.h" +#include "ext4_extents.h" + +/** + * get_ext_path - Find an extent path for designated logical block number. + * + * @inode: an inode which is searched + * @lblock: logical block number to find an extent path + * @path: pointer to an extent path pointer (for output) + * + * ext4_ext_find_extent wrapper. 
Return 0 on success, or a negative error value + * on failure. + */ +static inline int +get_ext_path(struct inode *inode, ext4_lblk_t lblock, + struct ext4_ext_path **orig_path) +{ + int ret = 0; + struct ext4_ext_path *path; + + path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE); + if (IS_ERR(path)) + ret = PTR_ERR(path); + else if (path[ext_depth(inode)].p_ext == NULL) + ret = -ENODATA; + else + *orig_path = path; + + return ret; +} + +/** + * copy_extent_status - Copy the extent's initialization status + * + * @src: an extent for getting initialize status + * @dest: an extent to be set the status + */ +static void +copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) +{ + if (ext4_ext_is_unwritten(src)) + ext4_ext_mark_unwritten(dest); + else + dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest)); +} + +/** + * mext_next_extent - Search for the next extent and set it to "extent" + * + * @inode: inode which is searched + * @path: this will obtain data for the next extent + * @extent: pointer to the next extent we have just gotten + * + * Search the next extent in the array of ext4_ext_path structure (@path) + * and set it to ext4_extent structure (@extent). In addition, the member of + * @path (->p_ext) also points the next extent. Return 0 on success, 1 if + * ext4_ext_path structure refers to the last extent, or a negative error + * value on failure. + */ +int +mext_next_extent(struct inode *inode, struct ext4_ext_path *path, + struct ext4_extent **extent) +{ + struct ext4_extent_header *eh; + int ppos, leaf_ppos = path->p_depth; + + ppos = leaf_ppos; + if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { + /* leaf block */ + *extent = ++path[ppos].p_ext; + path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); + return 0; + } + + while (--ppos >= 0) { + if (EXT_LAST_INDEX(path[ppos].p_hdr) > + path[ppos].p_idx) { + int cur_ppos = ppos; + + /* index block */ + path[ppos].p_idx++; + path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); + if (path[ppos+1].p_bh) + brelse(path[ppos+1].p_bh); + path[ppos+1].p_bh = + sb_bread(inode->i_sb, path[ppos].p_block); + if (!path[ppos+1].p_bh) + return -EIO; + path[ppos+1].p_hdr = + ext_block_hdr(path[ppos+1].p_bh); + + /* Halfway index block */ + while (++cur_ppos < leaf_ppos) { + path[cur_ppos].p_idx = + EXT_FIRST_INDEX(path[cur_ppos].p_hdr); + path[cur_ppos].p_block = + ext4_idx_pblock(path[cur_ppos].p_idx); + if (path[cur_ppos+1].p_bh) + brelse(path[cur_ppos+1].p_bh); + path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, + path[cur_ppos].p_block); + if (!path[cur_ppos+1].p_bh) + return -EIO; + path[cur_ppos+1].p_hdr = + ext_block_hdr(path[cur_ppos+1].p_bh); + } + + path[leaf_ppos].p_ext = *extent = NULL; + + eh = path[leaf_ppos].p_hdr; + if (le16_to_cpu(eh->eh_entries) == 0) + /* empty leaf is found */ + return -ENODATA; + + /* leaf block */ + path[leaf_ppos].p_ext = *extent = + EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); + path[leaf_ppos].p_block = + ext4_ext_pblock(path[leaf_ppos].p_ext); + return 0; + } + } + /* We found the last extent */ + return 1; +} + +/** + * ext4_double_down_write_data_sem - Acquire two inodes' write lock + * of i_data_sem + * + * Acquire write lock of i_data_sem of the two inodes + */ +void +ext4_double_down_write_data_sem(struct inode *first, struct inode *second) +{ + if (first < second) { + down_write(&EXT4_I(first)->i_data_sem); + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); + } else { + down_write(&EXT4_I(second)->i_data_sem); + 
down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING); + + } +} + +/** + * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem + * + * @orig_inode: original inode structure to be released its lock first + * @donor_inode: donor inode structure to be released its lock second + * Release write lock of i_data_sem of two inodes (orig and donor). + */ +void +ext4_double_up_write_data_sem(struct inode *orig_inode, + struct inode *donor_inode) +{ + up_write(&EXT4_I(orig_inode)->i_data_sem); + up_write(&EXT4_I(donor_inode)->i_data_sem); +} + +/** + * mext_insert_across_blocks - Insert extents across leaf block + * + * @handle: journal handle + * @orig_inode: original inode + * @o_start: first original extent to be changed + * @o_end: last original extent to be changed + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * + * Allocate a new leaf block and insert extents into it. Return 0 on success, + * or a negative error value on failure. + */ +static int +mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, + struct ext4_extent *o_start, struct ext4_extent *o_end, + struct ext4_extent *start_ext, struct ext4_extent *new_ext, + struct ext4_extent *end_ext) +{ + struct ext4_ext_path *orig_path = NULL; + ext4_lblk_t eblock = 0; + int new_flag = 0; + int end_flag = 0; + int err = 0; + + if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) { + if (o_start == o_end) { + + /* start_ext new_ext end_ext + * donor |---------|-----------|--------| + * orig |------------------------------| + */ + end_flag = 1; + } else { + + /* start_ext new_ext end_ext + * donor |---------|----------|---------| + * orig |---------------|--------------| + */ + o_end->ee_block = end_ext->ee_block; + o_end->ee_len = end_ext->ee_len; + ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); + } + + o_start->ee_len = start_ext->ee_len; + eblock = le32_to_cpu(start_ext->ee_block); + new_flag = 1; + + } else if (start_ext->ee_len && new_ext->ee_len && + !end_ext->ee_len && o_start == o_end) { + + /* start_ext new_ext + * donor |--------------|---------------| + * orig |------------------------------| + */ + o_start->ee_len = start_ext->ee_len; + eblock = le32_to_cpu(start_ext->ee_block); + new_flag = 1; + + } else if (!start_ext->ee_len && new_ext->ee_len && + end_ext->ee_len && o_start == o_end) { + + /* new_ext end_ext + * donor |--------------|---------------| + * orig |------------------------------| + */ + o_end->ee_block = end_ext->ee_block; + o_end->ee_len = end_ext->ee_len; + ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); + + /* + * Set 0 to the extent block if new_ext was + * the first block. 
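
All of the insert cases diagrammed here reduce to the same interval arithmetic: the original extent is cut into up to three pieces around the incoming one. A self-contained sketch of that split on simplified extents (logical block and length only; the names are invented):

#include <stdio.h>

struct ext {
	unsigned int block;	/* logical start */
	unsigned int len;
};

/*
 * Split an original extent around an incoming one, producing up to
 * three pieces: [start_ext][incoming][end_ext], mirroring the
 * start_ext/new_ext/end_ext cases in the comments above.
 */
static void split_around(const struct ext *orig, const struct ext *incoming,
			 struct ext *start_ext, struct ext *end_ext)
{
	unsigned int orig_end = orig->block + orig->len;
	unsigned int inc_end = incoming->block + incoming->len;

	start_ext->block = orig->block;
	start_ext->len = incoming->block > orig->block ?
				incoming->block - orig->block : 0;

	end_ext->block = inc_end;
	end_ext->len = orig_end > inc_end ? orig_end - inc_end : 0;
}

int main(void)
{
	struct ext orig = { 100, 50 }, incoming = { 120, 10 };
	struct ext s, e;

	split_around(&orig, &incoming, &s, &e);
	printf("start: %u+%u, new: %u+%u, end: %u+%u\n",
	       s.block, s.len, incoming.block, incoming.len, e.block, e.len);
	return 0;
}
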
+ */ + if (new_ext->ee_block) + eblock = le32_to_cpu(new_ext->ee_block); + + new_flag = 1; + } else { + ext4_debug("ext4 move extent: Unexpected insert case\n"); + return -EIO; + } + + if (new_flag) { + err = get_ext_path(orig_inode, eblock, &orig_path); + if (err) + goto out; + + if (ext4_ext_insert_extent(handle, orig_inode, + orig_path, new_ext, 0)) + goto out; + } + + if (end_flag) { + err = get_ext_path(orig_inode, + le32_to_cpu(end_ext->ee_block) - 1, &orig_path); + if (err) + goto out; + + if (ext4_ext_insert_extent(handle, orig_inode, + orig_path, end_ext, 0)) + goto out; + } +out: + if (orig_path) { + ext4_ext_drop_refs(orig_path); + kfree(orig_path); + } + + return err; + +} + +/** + * mext_insert_inside_block - Insert new extent to the extent block + * + * @o_start: first original extent to be moved + * @o_end: last original extent to be moved + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * @eh: extent header of target leaf block + * @range_to_move: used to decide how to insert extent + * + * Insert extents into the leaf block. The extent (@o_start) is overwritten + * by inserted extents. + */ +static void +mext_insert_inside_block(struct ext4_extent *o_start, + struct ext4_extent *o_end, + struct ext4_extent *start_ext, + struct ext4_extent *new_ext, + struct ext4_extent *end_ext, + struct ext4_extent_header *eh, + int range_to_move) +{ + int i = 0; + unsigned long len; + + /* Move the existing extents */ + if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) { + len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) - + (unsigned long)(o_end + 1); + memmove(o_end + 1 + range_to_move, o_end + 1, len); + } + + /* Insert start entry */ + if (start_ext->ee_len) + o_start[i++].ee_len = start_ext->ee_len; + + /* Insert new entry */ + if (new_ext->ee_len) { + o_start[i] = *new_ext; + ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext)); + } + + /* Insert end entry */ + if (end_ext->ee_len) + o_start[i] = *end_ext; + + /* Increment the total entries counter on the extent block */ + le16_add_cpu(&eh->eh_entries, range_to_move); +} + +/** + * mext_insert_extents - Insert new extent + * + * @handle: journal handle + * @orig_inode: original inode + * @orig_path: path indicates first extent to be changed + * @o_start: first original extent to be changed + * @o_end: last original extent to be changed + * @start_ext: first new extent to be inserted + * @new_ext: middle of new extent to be inserted + * @end_ext: last new extent to be inserted + * + * Call the function to insert extents. If we cannot add more extents into + * the leaf block, we call mext_insert_across_blocks() to create a + * new leaf block. Otherwise call mext_insert_inside_block(). Return 0 + * on success, or a negative error value on failure. + */ +static int +mext_insert_extents(handle_t *handle, struct inode *orig_inode, + struct ext4_ext_path *orig_path, + struct ext4_extent *o_start, + struct ext4_extent *o_end, + struct ext4_extent *start_ext, + struct ext4_extent *new_ext, + struct ext4_extent *end_ext) +{ + struct ext4_extent_header *eh; + unsigned long need_slots, slots_range; + int range_to_move, depth, ret; + + /* + * The extents need to be inserted + * start_extent + new_extent + end_extent. + */ + need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) + + (new_ext->ee_len ? 
1 : 0);
+
+	/* The number of slots between start and end */
+	slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1)
+		/ sizeof(struct ext4_extent);
+
+	/* Range to move the end of extent */
+	range_to_move = need_slots - slots_range;
+	depth = orig_path->p_depth;
+	orig_path += depth;
+	eh = orig_path->p_hdr;
+
+	if (depth) {
+		/* Register to journal */
+		BUFFER_TRACE(orig_path->p_bh, "get_write_access");
+		ret = ext4_journal_get_write_access(handle, orig_path->p_bh);
+		if (ret)
+			return ret;
+	}
+
+	/* Expansion */
+	if (range_to_move > 0 &&
+		(range_to_move > le16_to_cpu(eh->eh_max)
+			- le16_to_cpu(eh->eh_entries))) {
+
+		ret = mext_insert_across_blocks(handle, orig_inode, o_start,
+					o_end, start_ext, new_ext, end_ext);
+		if (ret < 0)
+			return ret;
+	} else
+		mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
+						end_ext, eh, range_to_move);
+
+	return ext4_ext_dirty(handle, orig_inode, orig_path);
+}
+
+/**
+ * mext_leaf_block - Move one leaf extent block into the inode.
+ *
+ * @handle:		journal handle
+ * @orig_inode:		original inode
+ * @orig_path:		path indicates first extent to be changed
+ * @dext:		donor extent
+ * @from:		start offset on the target file
+ *
+ * In order to insert extents into the leaf block, we must divide an
+ * existing extent in the leaf block into three pieces: one covering the
+ * range where the new extents will be inserted, and the others located
+ * around it.
+ *
+ * Therefore, this function creates structures to save extents of the leaf
+ * block, and inserts extents by calling mext_insert_extents() with
+ * created extents. Return 0 on success, or a negative error value on failure.
+ */
+static int
+mext_leaf_block(handle_t *handle, struct inode *orig_inode,
+		struct ext4_ext_path *orig_path, struct ext4_extent *dext,
+		ext4_lblk_t *from)
+{
+	struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
+	struct ext4_extent new_ext, start_ext, end_ext;
+	ext4_lblk_t new_ext_end;
+	int oext_alen, new_ext_alen, end_ext_alen;
+	int depth = ext_depth(orig_inode);
+	int ret;
+
+	start_ext.ee_block = end_ext.ee_block = 0;
+	o_start = o_end = oext = orig_path[depth].p_ext;
+	oext_alen = ext4_ext_get_actual_len(oext);
+	start_ext.ee_len = end_ext.ee_len = 0;
+
+	new_ext.ee_block = cpu_to_le32(*from);
+	ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
+	new_ext.ee_len = dext->ee_len;
+	new_ext_alen = ext4_ext_get_actual_len(&new_ext);
+	new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
+
+	/*
+	 * Case: original extent is first
+	 * oext      |--------|
+	 * new_ext       |--|
+	 * start_ext |--|
+	 */
+	if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) &&
+		le32_to_cpu(new_ext.ee_block) <
+		le32_to_cpu(oext->ee_block) + oext_alen) {
+		start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
+					       le32_to_cpu(oext->ee_block));
+		start_ext.ee_block = oext->ee_block;
+		copy_extent_status(oext, &start_ext);
+	} else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
+		prev_ext = oext - 1;
+		/*
+		 * We can merge new_ext into the previous extent,
+		 * if these are contiguous and of the same extent type.
+ */ + if (ext4_can_extents_be_merged(orig_inode, prev_ext, + &new_ext)) { + o_start = prev_ext; + start_ext.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(prev_ext) + + new_ext_alen); + start_ext.ee_block = oext->ee_block; + copy_extent_status(prev_ext, &start_ext); + new_ext.ee_len = 0; + } + } + + /* + * Case: new_ext_end must be less than oext + * oext |-----------| + * new_ext |-------| + */ + if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { + EXT4_ERROR_INODE(orig_inode, + "new_ext_end(%u) should be less than or equal to " + "oext->ee_block(%u) + oext_alen(%d) - 1", + new_ext_end, le32_to_cpu(oext->ee_block), + oext_alen); + ret = -EIO; + goto out; + } + + /* + * Case: new_ext is smaller than original extent + * oext |---------------| + * new_ext |-----------| + * end_ext |---| + */ + if (le32_to_cpu(oext->ee_block) <= new_ext_end && + new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) { + end_ext.ee_len = + cpu_to_le16(le32_to_cpu(oext->ee_block) + + oext_alen - 1 - new_ext_end); + copy_extent_status(oext, &end_ext); + end_ext_alen = ext4_ext_get_actual_len(&end_ext); + ext4_ext_store_pblock(&end_ext, + (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen)); + end_ext.ee_block = + cpu_to_le32(le32_to_cpu(o_end->ee_block) + + oext_alen - end_ext_alen); + } + + ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, + o_end, &start_ext, &new_ext, &end_ext); +out: + return ret; +} + +/** + * mext_calc_swap_extents - Calculate extents for extent swapping. + * + * @tmp_dext: the extent that will belong to the original inode + * @tmp_oext: the extent that will belong to the donor inode + * @orig_off: block offset of original inode + * @donor_off: block offset of donor inode + * @max_count: the maximum length of extents + * + * Return 0 on success, or a negative error value on failure. + */ +static int +mext_calc_swap_extents(struct ext4_extent *tmp_dext, + struct ext4_extent *tmp_oext, + ext4_lblk_t orig_off, ext4_lblk_t donor_off, + ext4_lblk_t max_count) +{ + ext4_lblk_t diff, orig_diff; + struct ext4_extent dext_old, oext_old; + + BUG_ON(orig_off != donor_off); + + /* original and donor extents have to cover the same block offset */ + if (orig_off < le32_to_cpu(tmp_oext->ee_block) || + le32_to_cpu(tmp_oext->ee_block) + + ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off) + return -ENODATA; + + if (orig_off < le32_to_cpu(tmp_dext->ee_block) || + le32_to_cpu(tmp_dext->ee_block) + + ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off) + return -ENODATA; + + dext_old = *tmp_dext; + oext_old = *tmp_oext; + + /* When tmp_dext is too large, pick up the target range. 
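
The trimming that follows can be illustrated in isolation: shift the extent's logical and physical start so it begins at the requested offset, shorten it accordingly, and clamp it to the allowed length. A sketch on a simplified extent type (names invented for illustration):

#include <stdio.h>

struct ext {
	unsigned int lblock;		/* logical start */
	unsigned long long pblock;	/* physical start */
	unsigned int len;
};

/*
 * Trim an extent so that it starts exactly at "off" and is no longer
 * than "max_count" -- the same arithmetic mext_calc_swap_extents()
 * applies to the donor extent.
 */
static void trim_to_range(struct ext *e, unsigned int off,
			  unsigned int max_count)
{
	unsigned int diff = off - e->lblock;	/* off must lie inside e */

	e->lblock += diff;
	e->pblock += diff;
	e->len -= diff;
	if (e->len > max_count)
		e->len = max_count;
}

int main(void)
{
	struct ext donor = { 100, 5000, 40 };

	trim_to_range(&donor, 110, 16);
	printf("lblock=%u pblock=%llu len=%u\n",
	       donor.lblock, donor.pblock, donor.len);
	return 0;
}
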
+	 */
+	diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
+
+	ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
+	le32_add_cpu(&tmp_dext->ee_block, diff);
+	le16_add_cpu(&tmp_dext->ee_len, -diff);
+
+	if (max_count < ext4_ext_get_actual_len(tmp_dext))
+		tmp_dext->ee_len = cpu_to_le16(max_count);
+
+	orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
+	ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
+
+	/* Adjust extent length if donor extent is larger than orig */
+	if (ext4_ext_get_actual_len(tmp_dext) >
+	    ext4_ext_get_actual_len(tmp_oext) - orig_diff)
+		tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) -
+						orig_diff);
+
+	tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext));
+
+	copy_extent_status(&oext_old, tmp_dext);
+	copy_extent_status(&dext_old, tmp_oext);
+
+	return 0;
+}
+
+/**
+ * mext_check_coverage - Check that all extents in the range have the same type
+ *
+ * @inode:		inode in question
+ * @from:		block offset of inode
+ * @count:		block count to be checked
+ * @unwritten:		extents expected to be unwritten
+ * @err:		pointer to save error value
+ *
+ * Return 1 if all extents in the range have the expected type, and zero
+ * otherwise.
+ */
+static int
+mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
+		    int unwritten, int *err)
+{
+	struct ext4_ext_path *path = NULL;
+	struct ext4_extent *ext;
+	int ret = 0;
+	ext4_lblk_t last = from + count;
+	while (from < last) {
+		*err = get_ext_path(inode, from, &path);
+		if (*err)
+			goto out;
+		ext = path[ext_depth(inode)].p_ext;
+		if (unwritten != ext4_ext_is_unwritten(ext))
+			goto out;
+		from += ext4_ext_get_actual_len(ext);
+		ext4_ext_drop_refs(path);
+	}
+	ret = 1;
+out:
+	if (path) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
+	return ret;
+}
+
+/**
+ * mext_replace_branches - Replace original extents with new extents
+ *
+ * @handle:		journal handle
+ * @orig_inode:		original inode
+ * @donor_inode:	donor inode
+ * @from:		block offset of orig_inode
+ * @count:		block count to be replaced
+ * @err:		pointer to save return value
+ *
+ * Replace original inode extents and donor inode extents page by page.
+ * We implement this replacement in the following three steps:
+ * 1. Save the block information of original and donor inodes into
+ *    dummy extents.
+ * 2. Change the block information of original inode to point at the
+ *    donor inode blocks.
+ * 3. Change the block information of donor inode to point at the saved
+ *    original inode blocks in the dummy extents.
+ *
+ * Return replaced block count.
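
On flat arrays the three steps above are just a swap with a saved copy; the complexity in mext_replace_branches() comes from doing this through extent trees and a journal. A minimal sketch of the underlying idea:

#include <stdio.h>

#define NBLOCKS 4

/*
 * The three steps from the comment above, on flat block maps: save the
 * original mapping, retarget the original at the donor blocks, then
 * retarget the donor at the saved blocks.
 */
static void swap_block_maps(unsigned long long *orig,
			    unsigned long long *donor, int n)
{
	for (int i = 0; i < n; i++) {
		unsigned long long saved = orig[i];	/* step 1 */

		orig[i] = donor[i];			/* step 2 */
		donor[i] = saved;			/* step 3 */
	}
}

int main(void)
{
	unsigned long long orig[NBLOCKS] = { 10, 11, 12, 13 };
	unsigned long long donor[NBLOCKS] = { 90, 91, 92, 93 };

	swap_block_maps(orig, donor, NBLOCKS);
	printf("orig now starts at %llu, donor at %llu\n",
	       orig[0], donor[0]);
	return 0;
}
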
+/**
+ * mext_replace_branches - Replace original extents with new extents
+ *
+ * @handle:		journal handle
+ * @orig_inode:		original inode
+ * @donor_inode:	donor inode
+ * @from:		block offset of orig_inode
+ * @count:		block count to be replaced
+ * @err:		pointer to save return value
+ *
+ * Replace original inode extents and donor inode extents page by page.
+ * We implement this replacement in the following three steps:
+ * 1. Save the block information of original and donor inodes into
+ *    dummy extents.
+ * 2. Change the block information of original inode to point at the
+ *    donor inode blocks.
+ * 3. Change the block information of donor inode to point at the saved
+ *    original inode blocks in the dummy extents.
+ *
+ * Return replaced block count.
+ */
+static int
+mext_replace_branches(handle_t *handle, struct inode *orig_inode,
+		      struct inode *donor_inode, ext4_lblk_t from,
+		      ext4_lblk_t count, int *err)
+{
+	struct ext4_ext_path *orig_path = NULL;
+	struct ext4_ext_path *donor_path = NULL;
+	struct ext4_extent *oext, *dext;
+	struct ext4_extent tmp_dext, tmp_oext;
+	ext4_lblk_t orig_off = from, donor_off = from;
+	int depth;
+	int replaced_count = 0;
+	int dext_alen;
+
+	*err = ext4_es_remove_extent(orig_inode, from, count);
+	if (*err)
+		goto out;
+
+	*err = ext4_es_remove_extent(donor_inode, from, count);
+	if (*err)
+		goto out;
+
+	/* Get the original extent for the block "orig_off" */
+	*err = get_ext_path(orig_inode, orig_off, &orig_path);
+	if (*err)
+		goto out;
+
+	/* Get the donor extent for the head */
+	*err = get_ext_path(donor_inode, donor_off, &donor_path);
+	if (*err)
+		goto out;
+	depth = ext_depth(orig_inode);
+	oext = orig_path[depth].p_ext;
+	tmp_oext = *oext;
+
+	depth = ext_depth(donor_inode);
+	dext = donor_path[depth].p_ext;
+	if (unlikely(!dext))
+		goto missing_donor_extent;
+	tmp_dext = *dext;
+
+	*err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+				      donor_off, count);
+	if (*err)
+		goto out;
+
+	/* Loop for the donor extents */
+	while (1) {
+		/* The extent for donor must be found. */
+		if (unlikely(!dext)) {
+		missing_donor_extent:
+			EXT4_ERROR_INODE(donor_inode,
+				   "The extent for donor must be found");
+			*err = -EIO;
+			goto out;
+		} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
+			EXT4_ERROR_INODE(donor_inode,
+				"Donor offset(%u) and the first block of donor "
+				"extent(%u) should be equal",
+				donor_off,
+				le32_to_cpu(tmp_dext.ee_block));
+			*err = -EIO;
+			goto out;
+		}
+
+		/* Set donor extent to orig extent */
+		*err = mext_leaf_block(handle, orig_inode,
+				       orig_path, &tmp_dext, &orig_off);
+		if (*err)
+			goto out;
+
+		/* Set orig extent to donor extent */
+		*err = mext_leaf_block(handle, donor_inode,
+				       donor_path, &tmp_oext, &donor_off);
+		if (*err)
+			goto out;
+
+		dext_alen = ext4_ext_get_actual_len(&tmp_dext);
+		replaced_count += dext_alen;
+		donor_off += dext_alen;
+		orig_off += dext_alen;
+
+		BUG_ON(replaced_count > count);
+		/* Already moved the expected blocks */
+		if (replaced_count >= count)
+			break;
+
+		if (orig_path)
+			ext4_ext_drop_refs(orig_path);
+		*err = get_ext_path(orig_inode, orig_off, &orig_path);
+		if (*err)
+			goto out;
+		depth = ext_depth(orig_inode);
+		oext = orig_path[depth].p_ext;
+		tmp_oext = *oext;
+
+		if (donor_path)
+			ext4_ext_drop_refs(donor_path);
+		*err = get_ext_path(donor_inode, donor_off, &donor_path);
+		if (*err)
+			goto out;
+		depth = ext_depth(donor_inode);
+		dext = donor_path[depth].p_ext;
+		tmp_dext = *dext;
+
+		*err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+					      donor_off, count - replaced_count);
+		if (*err)
+			goto out;
+	}
+
+out:
+	if (orig_path) {
+		ext4_ext_drop_refs(orig_path);
+		kfree(orig_path);
+	}
+	if (donor_path) {
+		ext4_ext_drop_refs(donor_path);
+		kfree(donor_path);
+	}
+
+	return replaced_count;
+}
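mext_page_double_lock() below avoids ABBA deadlocks by always taking the two page locks in a fixed global order, keyed by inode address: two movers racing on the same pair of files will agree on which lock comes first. The same pattern in miniature, sketched here with pthread mutexes; lock_pair() is illustrative, not a kernel helper:

#include <pthread.h>

/* Lock two mutexes in a stable global order (by address) so that any two
 * threads locking the same pair always acquire them in the same sequence. */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a > b) {			/* normalize to address order */
		pthread_mutex_t *tmp = a;
		a = b;
		b = tmp;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}

The kernel variant additionally swaps page[0] and page[1] back after locking, so the caller always receives the pages in its own argument order regardless of which inode sorted first.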
+/**
+ * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
+ *
+ * @inode1:	the inode structure
+ * @inode2:	the inode structure
+ * @index:	page index
+ * @page:	result page vector
+ *
+ * Grab two locked pages for the two inodes, in inode-address order.
+ */
+static int
+mext_page_double_lock(struct inode *inode1, struct inode *inode2,
+		      pgoff_t index, struct page *page[2])
+{
+	struct address_space *mapping[2];
+	unsigned fl = AOP_FLAG_NOFS;
+
+	BUG_ON(!inode1 || !inode2);
+	if (inode1 < inode2) {
+		mapping[0] = inode1->i_mapping;
+		mapping[1] = inode2->i_mapping;
+	} else {
+		mapping[0] = inode2->i_mapping;
+		mapping[1] = inode1->i_mapping;
+	}
+
+	page[0] = grab_cache_page_write_begin(mapping[0], index, fl);
+	if (!page[0])
+		return -ENOMEM;
+
+	page[1] = grab_cache_page_write_begin(mapping[1], index, fl);
+	if (!page[1]) {
+		unlock_page(page[0]);
+		page_cache_release(page[0]);
+		return -ENOMEM;
+	}
+	/*
+	 * grab_cache_page_write_begin() may not wait on the page's writeback
+	 * if the BDI does not demand it.  But it is reasonable to be very
+	 * conservative here and explicitly wait on the page's writeback.
+	 */
+	wait_on_page_writeback(page[0]);
+	wait_on_page_writeback(page[1]);
+	if (inode1 > inode2) {
+		struct page *tmp;
+		tmp = page[0];
+		page[0] = page[1];
+		page[1] = tmp;
+	}
+	return 0;
+}
+
+/* Force page buffers uptodate w/o dropping page's lock */
+static int
+mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	sector_t block;
+	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+	unsigned int blocksize, block_start, block_end;
+	int i, err, nr = 0, partial = 0;
+	BUG_ON(!PageLocked(page));
+	BUG_ON(PageWriteback(page));
+
+	if (PageUptodate(page))
+		return 0;
+
+	blocksize = 1 << inode->i_blkbits;
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, blocksize, 0);
+
+	head = page_buffers(page);
+	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	for (bh = head, block_start = 0; bh != head || !block_start;
+	     block++, block_start = block_end, bh = bh->b_this_page) {
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (!buffer_uptodate(bh))
+				partial = 1;
+			continue;
+		}
+		if (buffer_uptodate(bh))
+			continue;
+		if (!buffer_mapped(bh)) {
+			err = ext4_get_block(inode, block, bh, 0);
+			if (err) {
+				SetPageError(page);
+				return err;
+			}
+			if (!buffer_mapped(bh)) {
+				zero_user(page, block_start, blocksize);
+				set_buffer_uptodate(bh);
+				continue;
+			}
+		}
+		BUG_ON(nr >= MAX_BUF_PER_PAGE);
+		arr[nr++] = bh;
+	}
+	/* No io required */
+	if (!nr)
+		goto out;
+
+	for (i = 0; i < nr; i++) {
+		bh = arr[i];
+		if (!bh_uptodate_or_lock(bh)) {
+			err = bh_submit_read(bh);
+			if (err)
+				return err;
+		}
+	}
+out:
+	if (!partial)
+		SetPageUptodate(page);
+	return 0;
+}
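move_extent_per_page() below repeatedly converts between logical block numbers and page cache indexes. A standalone sketch of those conversions, assuming 4 KiB pages and 1 KiB blocks; the values are chosen only for illustration:

#include <stdio.h>

int main(void)
{
	unsigned page_shift = 12;		/* 4 KiB pages */
	unsigned blkbits = 10;			/* 1 KiB blocks */
	unsigned blocks_per_page = 1u << (page_shift - blkbits);	/* 4 */
	unsigned long long seq_start = 1030;	/* first block to move */

	/* page that contains the block, and the block's offset inside it */
	unsigned long long page_index = seq_start >> (page_shift - blkbits);
	unsigned offset_in_page = seq_start % blocks_per_page;

	/* prints: page 257, block 2 of 4 in that page */
	printf("page %llu, block %u of %u in that page\n",
	       page_index, offset_in_page, blocks_per_page);
	return 0;
}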
+/**
+ * move_extent_per_page - Move extent data per page
+ *
+ * @o_filp:			file structure of original file
+ * @donor_inode:		donor inode
+ * @orig_page_offset:		page index on original file
+ * @data_offset_in_page:	block index where data swapping starts
+ * @block_len_in_page:		the number of blocks to be swapped
+ * @unwritten:			orig extent is unwritten or not
+ * @err:			pointer to save return value
+ *
+ * Save the data in original inode blocks and replace original inode extents
+ * with donor inode extents by calling mext_replace_branches().
+ * Finally, write out the saved data in new original inode blocks. Return
+ * replaced block count.
+ */
+static int
+move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
+		     pgoff_t orig_page_offset, int data_offset_in_page,
+		     int block_len_in_page, int unwritten, int *err)
+{
+	struct inode *orig_inode = file_inode(o_filp);
+	struct page *pagep[2] = {NULL, NULL};
+	handle_t *handle;
+	ext4_lblk_t orig_blk_offset;
+	unsigned long blocksize = orig_inode->i_sb->s_blocksize;
+	unsigned int w_flags = 0;
+	unsigned int tmp_data_size, data_size, replaced_size;
+	int err2, jblocks, retries = 0;
+	int replaced_count = 0;
+	int from = data_offset_in_page << orig_inode->i_blkbits;
+	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+
+	/*
+	 * It needs twice the number of ordinary journal buffers because
+	 * inode and donor_inode may each modify different metadata blocks.
+	 */
+again:
+	*err = 0;
+	jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
+	handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
+	if (IS_ERR(handle)) {
+		*err = PTR_ERR(handle);
+		return 0;
+	}
+
+	if (segment_eq(get_fs(), KERNEL_DS))
+		w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+	orig_blk_offset = orig_page_offset * blocks_per_page +
+		data_offset_in_page;
+
+	/* Calculate data_size */
+	if ((orig_blk_offset + block_len_in_page - 1) ==
+	    ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
+		/* Replace the last block */
+		tmp_data_size = orig_inode->i_size & (blocksize - 1);
+		/*
+		 * If tmp_data_size is zero, i_size is a multiple of the
+		 * blocksize, so use a whole block.
+		 */
+		if (tmp_data_size == 0)
+			tmp_data_size = blocksize;
+
+		data_size = tmp_data_size +
+			((block_len_in_page - 1) << orig_inode->i_blkbits);
+	} else
+		data_size = block_len_in_page << orig_inode->i_blkbits;
+
+	replaced_size = data_size;
+
+	*err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset,
+				     pagep);
+	if (unlikely(*err < 0))
+		goto stop_journal;
+	/*
+	 * If the orig extent was unwritten, it can become initialized at any
+	 * time after i_data_sem is dropped.  In order to serialize with
+	 * delalloc, we recheck the extent while holding the page's lock; if
+	 * it is still unwritten, a data copy is not necessary and we just
+	 * swap the data blocks between orig and donor.
+	 */
+	if (unwritten) {
+		ext4_double_down_write_data_sem(orig_inode, donor_inode);
+		/* If any extent in the range became initialized we have to
+		 * fall back to data copying */
+		unwritten = mext_check_coverage(orig_inode, orig_blk_offset,
+						block_len_in_page, 1, err);
+		if (*err)
+			goto drop_data_sem;
+
+		unwritten &= mext_check_coverage(donor_inode, orig_blk_offset,
+						 block_len_in_page, 1, err);
+		if (*err)
+			goto drop_data_sem;
+
+		if (!unwritten) {
+			ext4_double_up_write_data_sem(orig_inode, donor_inode);
+			goto data_copy;
+		}
+		if ((page_has_private(pagep[0]) &&
+		     !try_to_release_page(pagep[0], 0)) ||
+		    (page_has_private(pagep[1]) &&
+		     !try_to_release_page(pagep[1], 0))) {
+			*err = -EBUSY;
+			goto drop_data_sem;
+		}
+		replaced_count = mext_replace_branches(handle, orig_inode,
+						donor_inode, orig_blk_offset,
+						block_len_in_page, err);
+	drop_data_sem:
+		ext4_double_up_write_data_sem(orig_inode, donor_inode);
+		goto unlock_pages;
+	}
+data_copy:
+	*err = mext_page_mkuptodate(pagep[0], from, from + replaced_size);
+	if (*err)
+		goto unlock_pages;
+
+	/* At this point all buffers in the range are uptodate and the old
+	 * mapping layout is no longer required; try to drop it now. */
*/ + if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) || + (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) { + *err = -EBUSY; + goto unlock_pages; + } + + replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, + orig_blk_offset, + block_len_in_page, err); + if (*err) { + if (replaced_count) { + block_len_in_page = replaced_count; + replaced_size = + block_len_in_page << orig_inode->i_blkbits; + } else + goto unlock_pages; + } + /* Perform all necessary steps similar write_begin()/write_end() + * but keeping in mind that i_size will not change */ + *err = __block_write_begin(pagep[0], from, replaced_size, + ext4_get_block); + if (!*err) + *err = block_commit_write(pagep[0], from, from + replaced_size); + + if (unlikely(*err < 0)) + goto repair_branches; + + /* Even in case of data=writeback it is reasonable to pin + * inode to transaction, to prevent unexpected data loss */ + *err = ext4_jbd2_file_inode(handle, orig_inode); + +unlock_pages: + unlock_page(pagep[0]); + page_cache_release(pagep[0]); + unlock_page(pagep[1]); + page_cache_release(pagep[1]); +stop_journal: + ext4_journal_stop(handle); + /* Buffer was busy because probably is pinned to journal transaction, + * force transaction commit may help to free it. */ + if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb, + &retries)) + goto again; + return replaced_count; + +repair_branches: + /* + * This should never ever happen! + * Extents are swapped already, but we are not able to copy data. + * Try to swap extents to it's original places + */ + ext4_double_down_write_data_sem(orig_inode, donor_inode); + replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, + orig_blk_offset, + block_len_in_page, &err2); + ext4_double_up_write_data_sem(orig_inode, donor_inode); + if (replaced_count != block_len_in_page) { + EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), + "Unable to copy data block," + " data will be lost."); + *err = -EIO; + } + replaced_count = 0; + goto unlock_pages; +} + +/** + * mext_check_arguments - Check whether move extent can be done + * + * @orig_inode: original inode + * @donor_inode: donor inode + * @orig_start: logical start offset in block for orig + * @donor_start: logical start offset in block for donor + * @len: the number of blocks to be moved + * + * Check the arguments of ext4_move_extents() whether the files can be + * exchanged with each other. + * Return 0 on success, or a negative error value on failure. 
+/**
+ * mext_check_arguments - Check whether move extent can be done
+ *
+ * @orig_inode:		original inode
+ * @donor_inode:	donor inode
+ * @orig_start:		logical start offset in block for orig
+ * @donor_start:	logical start offset in block for donor
+ * @len:		the number of blocks to be moved
+ *
+ * Check the arguments of ext4_move_extents() to determine whether the
+ * files can be exchanged with each other.
+ * Return 0 on success, or a negative error value on failure.
+ */
+static int
+mext_check_arguments(struct inode *orig_inode,
+		     struct inode *donor_inode, __u64 orig_start,
+		     __u64 donor_start, __u64 *len)
+{
+	ext4_lblk_t orig_blocks, donor_blocks;
+	unsigned int blkbits = orig_inode->i_blkbits;
+	unsigned int blocksize = 1 << blkbits;
+
+	if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
+		ext4_debug("ext4 move extent: suid or sgid is set"
+			   " to donor file [ino:orig %lu, donor %lu]\n",
+			   orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
+		return -EPERM;
+
+	/* ext4 move extent does not support swapfiles */
+	if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
+		ext4_debug("ext4 move extent: The argument files should "
+			"not be swapfile [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/* ext4 move extent supports only extent-based files */
+	if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
+		ext4_debug("ext4 move extent: orig file is not extents "
+			"based file [ino:orig %lu]\n", orig_inode->i_ino);
+		return -EOPNOTSUPP;
+	} else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
+		ext4_debug("ext4 move extent: donor file is not extents "
+			"based file [ino:donor %lu]\n", donor_inode->i_ino);
+		return -EOPNOTSUPP;
+	}
+
+	if ((!orig_inode->i_size) || (!donor_inode->i_size)) {
+		ext4_debug("ext4 move extent: File size is 0 byte\n");
+		return -EINVAL;
+	}
+
+	/* Start offsets must be the same */
+	if (orig_start != donor_start) {
+		ext4_debug("ext4 move extent: orig and donor's start "
+			"offset are not same [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	if ((orig_start >= EXT_MAX_BLOCKS) ||
+	    (*len > EXT_MAX_BLOCKS) ||
+	    (orig_start + *len >= EXT_MAX_BLOCKS)) {
+		ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
+			"[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	if (orig_inode->i_size > donor_inode->i_size) {
+		donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
+		/* TODO: eliminate this artificial restriction */
+		if (orig_start >= donor_blocks) {
+			ext4_debug("ext4 move extent: orig start offset "
+				"[%llu] should be less than donor file blocks "
+				"[%u] [ino:orig %lu, donor %lu]\n",
+				orig_start, donor_blocks,
+				orig_inode->i_ino, donor_inode->i_ino);
+			return -EINVAL;
+		}
+
+		/* TODO: eliminate this artificial restriction */
+		if (orig_start + *len > donor_blocks) {
+			ext4_debug("ext4 move extent: End offset [%llu] should "
+				"be less than donor file blocks [%u]."
+				"So adjust length from %llu to %llu "
+				"[ino:orig %lu, donor %lu]\n",
+				orig_start + *len, donor_blocks,
+				*len, donor_blocks - orig_start,
+				orig_inode->i_ino, donor_inode->i_ino);
+			*len = donor_blocks - orig_start;
+		}
+	} else {
+		orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
+		if (orig_start >= orig_blocks) {
+			ext4_debug("ext4 move extent: start offset [%llu] "
+				"should be less than original file blocks "
+				"[%u] [ino:orig %lu, donor %lu]\n",
+				orig_start, orig_blocks,
+				orig_inode->i_ino, donor_inode->i_ino);
+			return -EINVAL;
+		}
+
+		if (orig_start + *len > orig_blocks) {
+			ext4_debug("ext4 move extent: Adjust length "
+				"from %llu to %llu. Because it should be "
+				"less than original file blocks "
+				"[ino:orig %lu, donor %lu]\n",
+				*len, orig_blocks - orig_start,
+				orig_inode->i_ino, donor_inode->i_ino);
+			*len = orig_blocks - orig_start;
+		}
+	}
+
+	if (!*len) {
+		ext4_debug("ext4 move extent: len should not be 0 "
+			"[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
+			donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	return 0;
+}
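Step 6 of the comment below notes that moved_len tells the caller where a move stopped, so a defragmenter can resume from there. A user-space sketch of that driver loop, assuming the struct move_extent layout and EXT4_IOC_MOVE_EXT definition from the ext4 headers of this era, reproduced inline for illustration:

#include <sys/ioctl.h>
#include <linux/types.h>

/* Layout as in fs/ext4/ext4.h of this era; shown inline for illustration. */
struct move_extent {
	__u32 reserved;		/* should be zero */
	__u32 donor_fd;		/* donor file descriptor */
	__u64 orig_start;	/* logical start offset in block for orig */
	__u64 donor_start;	/* logical start offset in block for donor */
	__u64 len;		/* block length to be moved */
	__u64 moved_len;	/* moved block length */
};
#define EXT4_IOC_MOVE_EXT	_IOWR('f', 15, struct move_extent)

/* Move blocks [0, nr_blocks) of orig_fd into donor_fd's blocks, resuming
 * past moved_len after each call, as the comment below suggests. */
static int defrag_range(int orig_fd, int donor_fd, __u64 nr_blocks)
{
	struct move_extent me = { .donor_fd = donor_fd, .len = nr_blocks };

	while (me.len) {
		me.moved_len = 0;
		if (ioctl(orig_fd, EXT4_IOC_MOVE_EXT, &me) < 0)
			return -1;
		me.orig_start += me.moved_len;	/* resume past moved blocks */
		me.donor_start += me.moved_len;
		me.len -= me.moved_len;
		if (!me.moved_len)		/* no forward progress */
			break;
	}
	return 0;
}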
+/**
+ * ext4_move_extents - Exchange the specified range of a file
+ *
+ * @o_filp:		file structure of the original file
+ * @d_filp:		file structure of the donor file
+ * @orig_start:		start offset in block for orig
+ * @donor_start:	start offset in block for donor
+ * @len:		the number of blocks to be moved
+ * @moved_len:		moved block length
+ *
+ * This function returns 0 and sets the moved block length in moved_len
+ * if it succeeds; otherwise it returns an error value.
+ *
+ * Note: ext4_move_extents() proceeds in the following order:
+ * 1:ext4_move_extents() calculates the last block number of the moving
+ *   extent from the start block number (orig_start) and the number of
+ *   blocks to be moved (len) specified as arguments.
+ *   If the {orig, donor}_start points to a hole, the extent's start offset
+ *   pointed to by ext_cur (current extent), holecheck_path and orig_path
+ *   are set to just after the hole.
+ * 2:Continue step 3 to step 5, until the holecheck_path points to the last
+ *   extent or ext_cur exceeds block_end, which is the last logical block
+ *   number.
+ * 3:To get the length of a contiguous area, call mext_next_extent()
+ *   repeatedly with ext_cur (whose initial value comes from holecheck_path)
+ *   until a discontiguous extent is found, the start logical block number
+ *   exceeds block_end, or the extent points to the last extent.
+ * 4:Exchange the original inode data with the donor inode data
+ *   from orig_page_offset to seq_end_page.
+ *   The start indexes of the data are specified as arguments.
+ *   That of the original inode is orig_page_offset,
+ *   and that of the donor inode is also orig_page_offset
+ *   (to easily handle the blocksize != pagesize case, the offset for the
+ *   donor inode is in block units).
+ * 5:Update holecheck_path and orig_path to point to the next extent to
+ *   process, then return to step 2.
+ * 6:Release holecheck_path and orig_path, and set len in moved_len,
+ *   which is the number of moved blocks.
+ *   moved_len is useful for the command to calculate the file offset
+ *   at which to start the next move extent ioctl.
+ * 7:Return 0 on success, or a negative error value on failure.
+ */
+int
+ext4_move_extents(struct file *o_filp, struct file *d_filp,
+		  __u64 orig_start, __u64 donor_start, __u64 len,
+		  __u64 *moved_len)
+{
+	struct inode *orig_inode = file_inode(o_filp);
+	struct inode *donor_inode = file_inode(d_filp);
+	struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;
+	struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
+	ext4_lblk_t block_start = orig_start;
+	ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
+	ext4_lblk_t rest_blocks;
+	pgoff_t orig_page_offset = 0, seq_end_page;
+	int ret, depth, last_extent = 0;
+	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+	int data_offset_in_page;
+	int block_len_in_page;
+	int unwritten;
+
+	if (orig_inode->i_sb != donor_inode->i_sb) {
+		ext4_debug("ext4 move extent: The argument files "
+			"should be in same FS [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/* orig and donor should be different inodes */
+	if (orig_inode == donor_inode) {
+		ext4_debug("ext4 move extent: The argument files should not "
+			"be same inode [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/* Regular file check */
+	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+		ext4_debug("ext4 move extent: The argument files should be "
+			"regular file [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+	/* TODO: swapping blocks for inodes with full data journaling enabled
+	   is a non-obvious task */
+	if (ext4_should_journal_data(orig_inode) ||
+	    ext4_should_journal_data(donor_inode)) {
+		return -EINVAL;
+	}
+	/* Protect orig and donor inodes against a truncate */
+	lock_two_nondirectories(orig_inode, donor_inode);
+
+	/* Wait for all existing dio workers */
+	ext4_inode_block_unlocked_dio(orig_inode);
+	ext4_inode_block_unlocked_dio(donor_inode);
+	inode_dio_wait(orig_inode);
+	inode_dio_wait(donor_inode);
+
+	/* Protect extent tree against block allocations via delalloc */
+	ext4_double_down_write_data_sem(orig_inode, donor_inode);
+	/* Check the filesystem environment whether move_extent can be done */
+	ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
+				   donor_start, &len);
+	if (ret)
+		goto out;
+
+	file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
+	block_end = block_start + len - 1;
+	if (file_end < block_end)
+		len -= block_end - file_end;
+
+	ret = get_ext_path(orig_inode, block_start, &orig_path);
+	if (ret)
+		goto out;
+
+	/* Get path structure to check the hole */
+	ret = get_ext_path(orig_inode, block_start, &holecheck_path);
+	if (ret)
+		goto out;
+
+	depth = ext_depth(orig_inode);
+	ext_cur = holecheck_path[depth].p_ext;
+
+	/*
+	 * Get the proper starting location of the block replacement if
+	 * block_start was within a hole.
+	 */
+	if (le32_to_cpu(ext_cur->ee_block) +
+	    ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
+		/*
+		 * The hole exists between extents or at the tail of the
+		 * original file.
+		 */
+		last_extent = mext_next_extent(orig_inode,
+					       holecheck_path, &ext_cur);
+		if (last_extent < 0) {
+			ret = last_extent;
+			goto out;
+		}
+		last_extent = mext_next_extent(orig_inode, orig_path,
+					       &ext_dummy);
+		if (last_extent < 0) {
+			ret = last_extent;
+			goto out;
+		}
+		seq_start = le32_to_cpu(ext_cur->ee_block);
+	} else if (le32_to_cpu(ext_cur->ee_block) > block_start)
+		/* The hole exists at the beginning of the original file. */
+		seq_start = le32_to_cpu(ext_cur->ee_block);
+	else
+		seq_start = block_start;
+
+	/* No blocks within the specified range. */
+	if (le32_to_cpu(ext_cur->ee_block) > block_end) {
+		ext4_debug("ext4 move extent: The specified range of file "
+			   "may be the hole\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Adjust start blocks */
+	add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
+			 ext4_ext_get_actual_len(ext_cur), block_end + 1) -
+		     max(le32_to_cpu(ext_cur->ee_block), block_start);
+
+	while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
+		seq_blocks += add_blocks;
+
+		/* Adjust tail blocks */
+		if (seq_start + seq_blocks - 1 > block_end)
+			seq_blocks = block_end - seq_start + 1;
+
+		ext_prev = ext_cur;
+		last_extent = mext_next_extent(orig_inode, holecheck_path,
+					       &ext_cur);
+		if (last_extent < 0) {
+			ret = last_extent;
+			break;
+		}
+		add_blocks = ext4_ext_get_actual_len(ext_cur);
+
+		/*
+		 * Extend the length of the contiguous block run (seq_blocks)
+		 * if the extents are contiguous.
+		 */
+		if (ext4_can_extents_be_merged(orig_inode,
+					       ext_prev, ext_cur) &&
+		    block_end >= le32_to_cpu(ext_cur->ee_block) &&
+		    !last_extent)
+			continue;
+
+		/* Is the original extent unwritten? */
+		unwritten = ext4_ext_is_unwritten(ext_prev);
+
+		data_offset_in_page = seq_start % blocks_per_page;
+
+		/*
+		 * Calculate the data block count to be swapped
+		 * at the first page.
+		 */
+		if (data_offset_in_page + seq_blocks > blocks_per_page) {
+			/* Swapped blocks are across pages */
+			block_len_in_page =
+				blocks_per_page - data_offset_in_page;
+		} else {
+			/* Swapped blocks are in a page */
+			block_len_in_page = seq_blocks;
+		}
+
+		orig_page_offset = seq_start >>
+				(PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
+		seq_end_page = (seq_start + seq_blocks - 1) >>
+				(PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
+		seq_start = le32_to_cpu(ext_cur->ee_block);
+		rest_blocks = seq_blocks;
+
+		/*
+		 * Up semaphore to avoid following problems:
+		 * a. transaction deadlock among ext4_journal_start,
+		 *    ->write_begin via pagefault, and jbd2_journal_commit
+		 * b. racing with ->readpage, ->write_begin, and ext4_get_block
+		 *    in move_extent_per_page
+		 */
+		ext4_double_up_write_data_sem(orig_inode, donor_inode);
+
+		while (orig_page_offset <= seq_end_page) {
+
+			/* Swap original branches with new branches */
+			block_len_in_page = move_extent_per_page(
+						o_filp, donor_inode,
+						orig_page_offset,
+						data_offset_in_page,
+						block_len_in_page,
+						unwritten, &ret);
+
+			/* Count how many blocks we have exchanged */
+			*moved_len += block_len_in_page;
+			if (ret < 0)
+				break;
+			if (*moved_len > len) {
+				EXT4_ERROR_INODE(orig_inode,
+					"We replaced blocks too much! 
" + "sum of replaced: %llu requested: %llu", + *moved_len, len); + ret = -EIO; + break; + } + + orig_page_offset++; + data_offset_in_page = 0; + rest_blocks -= block_len_in_page; + if (rest_blocks > blocks_per_page) + block_len_in_page = blocks_per_page; + else + block_len_in_page = rest_blocks; + } + + ext4_double_down_write_data_sem(orig_inode, donor_inode); + if (ret < 0) + break; + + /* Decrease buffer counter */ + if (holecheck_path) + ext4_ext_drop_refs(holecheck_path); + ret = get_ext_path(orig_inode, seq_start, &holecheck_path); + if (ret) + break; + depth = holecheck_path->p_depth; + + /* Decrease buffer counter */ + if (orig_path) + ext4_ext_drop_refs(orig_path); + ret = get_ext_path(orig_inode, seq_start, &orig_path); + if (ret) + break; + + ext_cur = holecheck_path[depth].p_ext; + add_blocks = ext4_ext_get_actual_len(ext_cur); + seq_blocks = 0; + + } +out: + if (*moved_len) { + ext4_discard_preallocations(orig_inode); + ext4_discard_preallocations(donor_inode); + } + + if (orig_path) { + ext4_ext_drop_refs(orig_path); + kfree(orig_path); + } + if (holecheck_path) { + ext4_ext_drop_refs(holecheck_path); + kfree(holecheck_path); + } + ext4_double_up_write_data_sem(orig_inode, donor_inode); + ext4_inode_resume_unlocked_dio(orig_inode); + ext4_inode_resume_unlocked_dio(donor_inode); + unlock_two_nondirectories(orig_inode, donor_inode); + + return ret; +} diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 28aa2ed4297..3520ab8a663 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -28,39 +28,131 @@ #include <linux/pagemap.h> #include <linux/jbd2.h> #include <linux/time.h> -#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/bio.h> +#include "ext4.h" +#include "ext4_jbd2.h" -#include "namei.h" #include "xattr.h" #include "acl.h" +#include <trace/events/ext4.h> /* * define how far ahead to read directories while searching them. 
*/ #define NAMEI_RA_CHUNKS 2 #define NAMEI_RA_BLOCKS 4 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) static struct buffer_head *ext4_append(handle_t *handle, struct inode *inode, - ext4_lblk_t *block, int *err) + ext4_lblk_t *block) { struct buffer_head *bh; + int err = 0; + + if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && + ((inode->i_size >> 10) >= + EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) + return ERR_PTR(-ENOSPC); *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - if ((bh = ext4_bread(handle, inode, *block, 1, err))) { - inode->i_size += inode->i_sb->s_blocksize; - EXT4_I(inode)->i_disksize = inode->i_size; - ext4_journal_get_write_access(handle,bh); + bh = ext4_bread(handle, inode, *block, 1, &err); + if (!bh) + return ERR_PTR(err); + inode->i_size += inode->i_sb->s_blocksize; + EXT4_I(inode)->i_disksize = inode->i_size; + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); + ext4_std_error(inode->i_sb, err); + return ERR_PTR(err); + } + return bh; +} + +static int ext4_dx_csum_verify(struct inode *inode, + struct ext4_dir_entry *dirent); + +typedef enum { + EITHER, INDEX, DIRENT +} dirblock_type_t; + +#define ext4_read_dirblock(inode, block, type) \ + __ext4_read_dirblock((inode), (block), (type), __LINE__) + +static struct buffer_head *__ext4_read_dirblock(struct inode *inode, + ext4_lblk_t block, + dirblock_type_t type, + unsigned int line) +{ + struct buffer_head *bh; + struct ext4_dir_entry *dirent; + int err = 0, is_dx_block = 0; + + bh = ext4_bread(NULL, inode, block, 0, &err); + if (!bh) { + if (err == 0) { + ext4_error_inode(inode, __func__, line, block, + "Directory hole found"); + return ERR_PTR(-EIO); + } + __ext4_warning(inode->i_sb, __func__, line, + "error reading directory block " + "(ino %lu, block %lu)", inode->i_ino, + (unsigned long) block); + return ERR_PTR(err); + } + dirent = (struct ext4_dir_entry *) bh->b_data; + /* Determine whether or not we have an index block */ + if (is_dx(inode)) { + if (block == 0) + is_dx_block = 1; + else if (ext4_rec_len_from_disk(dirent->rec_len, + inode->i_sb->s_blocksize) == + inode->i_sb->s_blocksize) + is_dx_block = 1; + } + if (!is_dx_block && type == INDEX) { + ext4_error_inode(inode, __func__, line, block, + "directory leaf block found instead of index block"); + return ERR_PTR(-EIO); + } + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) || + buffer_verified(bh)) + return bh; + + /* + * An empty leaf block can get mistaken for a index block; for + * this reason, we can only check the index checksum when the + * caller is sure it should be an index block. 
+ */ + if (is_dx_block && type == INDEX) { + if (ext4_dx_csum_verify(inode, dirent)) + set_buffer_verified(bh); + else { + ext4_error_inode(inode, __func__, line, block, + "Directory index failed checksum"); + brelse(bh); + return ERR_PTR(-EIO); + } + } + if (!is_dx_block) { + if (ext4_dirent_csum_verify(inode, dirent)) + set_buffer_verified(bh); + else { + ext4_error_inode(inode, __func__, line, block, + "Directory block failed checksum"); + brelse(bh); + return ERR_PTR(-EIO); + } } return bh; } @@ -69,10 +161,6 @@ static struct buffer_head *ext4_append(handle_t *handle, #define assert(test) J_ASSERT(test) #endif -#ifndef swap -#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) -#endif - #ifdef DX_DEBUG #define dxtrace(command) command #else @@ -144,39 +232,281 @@ struct dx_map_entry u16 size; }; +/* + * This goes at the end of each htree block. + */ +struct dx_tail { + u32 dt_reserved; + __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */ +}; + static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); -static inline unsigned dx_get_hash (struct dx_entry *entry); -static void dx_set_hash (struct dx_entry *entry, unsigned value); -static unsigned dx_get_count (struct dx_entry *entries); -static unsigned dx_get_limit (struct dx_entry *entries); -static void dx_set_count (struct dx_entry *entries, unsigned value); -static void dx_set_limit (struct dx_entry *entries, unsigned value); -static unsigned dx_root_limit (struct inode *dir, unsigned infosize); -static unsigned dx_node_limit (struct inode *dir); -static struct dx_frame *dx_probe(struct dentry *dentry, +static inline unsigned dx_get_hash(struct dx_entry *entry); +static void dx_set_hash(struct dx_entry *entry, unsigned value); +static unsigned dx_get_count(struct dx_entry *entries); +static unsigned dx_get_limit(struct dx_entry *entries); +static void dx_set_count(struct dx_entry *entries, unsigned value); +static void dx_set_limit(struct dx_entry *entries, unsigned value); +static unsigned dx_root_limit(struct inode *dir, unsigned infosize); +static unsigned dx_node_limit(struct inode *dir); +static struct dx_frame *dx_probe(const struct qstr *d_name, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame, int *err); -static void dx_release (struct dx_frame *frames); -static int dx_make_map (struct ext4_dir_entry_2 *de, int size, - struct dx_hash_info *hinfo, struct dx_map_entry map[]); +static void dx_release(struct dx_frame *frames); +static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); static void dx_sort_map(struct dx_map_entry *map, unsigned count); -static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to, - struct dx_map_entry *offsets, int count); -static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size); +static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, + struct dx_map_entry *offsets, int count, unsigned blocksize); +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize); static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block); static int ext4_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frame, struct dx_frame *frames, __u32 *start_hash); -static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, - struct ext4_dir_entry_2 **res_dir, int *err); +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + const 
struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *err); static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); +/* checksumming functions */ +void initialize_dirent_tail(struct ext4_dir_entry_tail *t, + unsigned int blocksize) +{ + memset(t, 0, sizeof(struct ext4_dir_entry_tail)); + t->det_rec_len = ext4_rec_len_to_disk( + sizeof(struct ext4_dir_entry_tail), blocksize); + t->det_reserved_ft = EXT4_FT_DIR_CSUM; +} + +/* Walk through a dirent block to find a checksum "dirent" at the tail */ +static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, + struct ext4_dir_entry *de) +{ + struct ext4_dir_entry_tail *t; + +#ifdef PARANOID + struct ext4_dir_entry *d, *top; + + d = de; + top = (struct ext4_dir_entry *)(((void *)de) + + (EXT4_BLOCK_SIZE(inode->i_sb) - + sizeof(struct ext4_dir_entry_tail))); + while (d < top && d->rec_len) + d = (struct ext4_dir_entry *)(((void *)d) + + le16_to_cpu(d->rec_len)); + + if (d != top) + return NULL; + + t = (struct ext4_dir_entry_tail *)d; +#else + t = EXT4_DIRENT_TAIL(de, EXT4_BLOCK_SIZE(inode->i_sb)); +#endif + + if (t->det_reserved_zero1 || + le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) || + t->det_reserved_zero2 || + t->det_reserved_ft != EXT4_FT_DIR_CSUM) + return NULL; + + return t; +} + +static __le32 ext4_dirent_csum(struct inode *inode, + struct ext4_dir_entry *dirent, int size) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + __u32 csum; + + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); + return cpu_to_le32(csum); +} + +static void warn_no_space_for_csum(struct inode *inode) +{ + ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for " + "checksum. 
Please run e2fsck -D.", inode->i_ino); +} + +int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) +{ + struct ext4_dir_entry_tail *t; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + t = get_dirent_tail(inode, dirent); + if (!t) { + warn_no_space_for_csum(inode); + return 0; + } + + if (t->det_checksum != ext4_dirent_csum(inode, dirent, + (void *)t - (void *)dirent)) + return 0; + + return 1; +} + +static void ext4_dirent_csum_set(struct inode *inode, + struct ext4_dir_entry *dirent) +{ + struct ext4_dir_entry_tail *t; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + t = get_dirent_tail(inode, dirent); + if (!t) { + warn_no_space_for_csum(inode); + return; + } + + t->det_checksum = ext4_dirent_csum(inode, dirent, + (void *)t - (void *)dirent); +} + +int ext4_handle_dirty_dirent_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) +{ + ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); + return ext4_handle_dirty_metadata(handle, inode, bh); +} + +static struct dx_countlimit *get_dx_countlimit(struct inode *inode, + struct ext4_dir_entry *dirent, + int *offset) +{ + struct ext4_dir_entry *dp; + struct dx_root_info *root; + int count_offset; + + if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb)) + count_offset = 8; + else if (le16_to_cpu(dirent->rec_len) == 12) { + dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); + if (le16_to_cpu(dp->rec_len) != + EXT4_BLOCK_SIZE(inode->i_sb) - 12) + return NULL; + root = (struct dx_root_info *)(((void *)dp + 12)); + if (root->reserved_zero || + root->info_length != sizeof(struct dx_root_info)) + return NULL; + count_offset = 32; + } else + return NULL; + + if (offset) + *offset = count_offset; + return (struct dx_countlimit *)(((void *)dirent) + count_offset); +} + +static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, + int count_offset, int count, struct dx_tail *t) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + __u32 csum; + __le32 save_csum; + int size; + + size = count_offset + (count * sizeof(struct dx_entry)); + save_csum = t->dt_checksum; + t->dt_checksum = 0; + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); + csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); + t->dt_checksum = save_csum; + + return cpu_to_le32(csum); +} + +static int ext4_dx_csum_verify(struct inode *inode, + struct ext4_dir_entry *dirent) +{ + struct dx_countlimit *c; + struct dx_tail *t; + int count_offset, limit, count; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + c = get_dx_countlimit(inode, dirent, &count_offset); + if (!c) { + EXT4_ERROR_INODE(inode, "dir seems corrupt? 
Run e2fsck -D."); + return 1; + } + limit = le16_to_cpu(c->limit); + count = le16_to_cpu(c->count); + if (count_offset + (limit * sizeof(struct dx_entry)) > + EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { + warn_no_space_for_csum(inode); + return 1; + } + t = (struct dx_tail *)(((struct dx_entry *)c) + limit); + + if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset, + count, t)) + return 0; + return 1; +} + +static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) +{ + struct dx_countlimit *c; + struct dx_tail *t; + int count_offset, limit, count; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + c = get_dx_countlimit(inode, dirent, &count_offset); + if (!c) { + EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); + return; + } + limit = le16_to_cpu(c->limit); + count = le16_to_cpu(c->count); + if (count_offset + (limit * sizeof(struct dx_entry)) > + EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { + warn_no_space_for_csum(inode); + return; + } + t = (struct dx_tail *)(((struct dx_entry *)c) + limit); + + t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t); +} + +static inline int ext4_handle_dirty_dx_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) +{ + ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); + return ext4_handle_dirty_metadata(handle, inode, bh); +} + +/* + * p is at least 6 bytes before the end of page + */ +static inline struct ext4_dir_entry_2 * +ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) +{ + return (struct ext4_dir_entry_2 *)((char *)p + + ext4_rec_len_from_disk(p->rec_len, blocksize)); +} + /* * Future: use high four bits of block for coalesce-on-delete flags * Mask them off for now. @@ -192,59 +522,67 @@ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) entry->block = cpu_to_le32(value); } -static inline unsigned dx_get_hash (struct dx_entry *entry) +static inline unsigned dx_get_hash(struct dx_entry *entry) { return le32_to_cpu(entry->hash); } -static inline void dx_set_hash (struct dx_entry *entry, unsigned value) +static inline void dx_set_hash(struct dx_entry *entry, unsigned value) { entry->hash = cpu_to_le32(value); } -static inline unsigned dx_get_count (struct dx_entry *entries) +static inline unsigned dx_get_count(struct dx_entry *entries) { return le16_to_cpu(((struct dx_countlimit *) entries)->count); } -static inline unsigned dx_get_limit (struct dx_entry *entries) +static inline unsigned dx_get_limit(struct dx_entry *entries) { return le16_to_cpu(((struct dx_countlimit *) entries)->limit); } -static inline void dx_set_count (struct dx_entry *entries, unsigned value) +static inline void dx_set_count(struct dx_entry *entries, unsigned value) { ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); } -static inline void dx_set_limit (struct dx_entry *entries, unsigned value) +static inline void dx_set_limit(struct dx_entry *entries, unsigned value) { ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); } -static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) +static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) { unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - EXT4_DIR_REC_LEN(2) - infosize; - return 0? 
20: entry_space / sizeof(struct dx_entry); + + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + entry_space -= sizeof(struct dx_tail); + return entry_space / sizeof(struct dx_entry); } -static inline unsigned dx_node_limit (struct inode *dir) +static inline unsigned dx_node_limit(struct inode *dir) { unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); - return 0? 22: entry_space / sizeof(struct dx_entry); + + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + entry_space -= sizeof(struct dx_tail); + return entry_space / sizeof(struct dx_entry); } /* * Debug */ #ifdef DX_DEBUG -static void dx_show_index (char * label, struct dx_entry *entries) +static void dx_show_index(char * label, struct dx_entry *entries) { int i, n = dx_get_count (entries); - printk("%s index ", label); + printk(KERN_DEBUG "%s index ", label); for (i = 0; i < n; i++) { - printk("%x->%lu ", i? dx_get_hash(entries + i) : + printk("%x->%lu ", i ? dx_get_hash(entries + i) : 0, (unsigned long)dx_get_block(entries + i)); } printk("\n"); @@ -276,12 +614,12 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent while (len--) printk("%c", *name++); ext4fs_dirhash(de->name, de->name_len, &h); printk(":%x.%u ", h.hash, - ((char *) de - base)); + (unsigned) ((char *) de - base)); } space += EXT4_DIR_REC_LEN(de->name_len); names++; } - de = ext4_next_entry(de); + de = ext4_next_entry(de, size); } printk("(%i)\n", names); return (struct stats) { names, space, 1 }; @@ -291,7 +629,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, struct dx_entry *entries, int levels) { unsigned blocksize = dir->i_sb->s_blocksize; - unsigned count = dx_get_count (entries), names = 0, space = 0, i; + unsigned count = dx_get_count(entries), names = 0, space = 0, i; unsigned bcount = 0; struct buffer_head *bh; int err; @@ -310,11 +648,12 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, names += stats.names; space += stats.space; bcount += stats.bcount; - brelse (bh); + brelse(bh); } if (bcount) - printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", - names, space/bcount,(space/bcount)*100/blocksize); + printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", + levels ? "" : " ", names, space/bcount, + (space/bcount)*100/blocksize); return (struct stats) { names, space, bcount}; } #endif /* DX_DEBUG */ @@ -329,7 +668,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, * back to userspace. 
*/ static struct dx_frame * -dx_probe(struct dentry *dentry, struct inode *dir, +dx_probe(const struct qstr *d_name, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) { unsigned count, indirect; @@ -340,30 +679,31 @@ dx_probe(struct dentry *dentry, struct inode *dir, u32 hash; frame->bh = NULL; - if (dentry) - dir = dentry->d_parent->d_inode; - if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) + bh = ext4_read_dirblock(dir, 0, INDEX); + if (IS_ERR(bh)) { + *err = PTR_ERR(bh); goto fail; + } root = (struct dx_root *) bh->b_data; if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && root->info.hash_version != DX_HASH_LEGACY) { - ext4_warning(dir->i_sb, __FUNCTION__, - "Unrecognised inode hash code %d", + ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", root->info.hash_version); brelse(bh); *err = ERR_BAD_DX_DIR; goto fail; } hinfo->hash_version = root->info.hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; - if (dentry) - ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); + if (d_name) + ext4fs_dirhash(d_name->name, d_name->len, hinfo); hash = hinfo->hash; if (root->info.unused_flags & 1) { - ext4_warning(dir->i_sb, __FUNCTION__, - "Unimplemented inode hash flags: %#06x", + ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", root->info.unused_flags); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -371,8 +711,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, } if ((indirect = root->info.indirect_levels) > 1) { - ext4_warning(dir->i_sb, __FUNCTION__, - "Unimplemented inode hash depth: %#06x", + ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", root->info.indirect_levels); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -384,19 +723,18 @@ dx_probe(struct dentry *dentry, struct inode *dir, if (dx_get_limit(entries) != dx_root_limit(dir, root->info.info_length)) { - ext4_warning(dir->i_sb, __FUNCTION__, - "dx entry: limit != root limit"); + ext4_warning(dir->i_sb, "dx entry: limit != root limit"); brelse(bh); *err = ERR_BAD_DX_DIR; goto fail; } - dxtrace (printk("Look up %x", hash)); + dxtrace(printk("Look up %x", hash)); while (1) { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { - ext4_warning(dir->i_sb, __FUNCTION__, + ext4_warning(dir->i_sb, "dx entry: no count or count > limit"); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -437,11 +775,15 @@ dx_probe(struct dentry *dentry, struct inode *dir, frame->entries = entries; frame->at = at; if (!indirect--) return frame; - if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err))) + bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); + if (IS_ERR(bh)) { + *err = PTR_ERR(bh); goto fail2; - at = entries = ((struct dx_node *) bh->b_data)->entries; + } + entries = ((struct dx_node *) bh->b_data)->entries; + if (dx_get_limit(entries) != dx_node_limit (dir)) { - ext4_warning(dir->i_sb, __FUNCTION__, + ext4_warning(dir->i_sb, "dx entry: limit != node limit"); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -457,8 +799,8 @@ fail2: } fail: if (*err == ERR_BAD_DX_DIR) - ext4_warning(dir->i_sb, __FUNCTION__, - "Corrupt dir inode %ld, running e2fsck is " + ext4_warning(dir->i_sb, + "Corrupt dir inode %lu, running e2fsck is " "recommended.", dir->i_ino); return NULL; } @@ -497,7 +839,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, { struct dx_frame *p; struct buffer_head *bh; - int err, num_frames = 0; + 
int num_frames = 0; __u32 bhash; p = frame; @@ -536,11 +878,11 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, * block so no check is necessary */ while (num_frames--) { - if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at), - 0, &err))) - return err; /* Failure */ + bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); + if (IS_ERR(bh)) + return PTR_ERR(bh); p++; - brelse (p->bh); + brelse(p->bh); p->bh = bh; p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; } @@ -549,15 +891,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, /* - * p is at least 6 bytes before the end of page - */ -static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p) -{ - return (struct ext4_dir_entry_2 *)((char *)p + - ext4_rec_len_from_disk(p->rec_len)); -} - -/* * This function fills a red-black tree with information from a * directory block. It returns the number directory entries loaded * into the tree. If there is an error it is returned in err. @@ -569,26 +902,25 @@ static int htree_dirblock_to_tree(struct file *dir_file, { struct buffer_head *bh; struct ext4_dir_entry_2 *de, *top; - int err, count = 0; + int err = 0, count = 0; dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); - if (!(bh = ext4_bread (NULL, dir, block, 0, &err))) - return err; + bh = ext4_read_dirblock(dir, block, DIRENT); + if (IS_ERR(bh)) + return PTR_ERR(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); - for (; de < top; de = ext4_next_entry(de)) { - if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, - (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) - +((char *)de - bh->b_data))) { - /* On error, skip the f_pos to the next block. 
*/ - dir_file->f_pos = (dir_file->f_pos | - (dir->i_sb->s_blocksize - 1)) + 1; - brelse (bh); - return count; + for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + bh->b_data, bh->b_size, + (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) + + ((char *)de - bh->b_data))) { + /* silently ignore the rest of the block */ + break; } ext4fs_dirhash(de->name, de->name_len, hinfo); if ((hinfo->hash < start_hash) || @@ -629,12 +961,26 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, int ret, err; __u32 hashval; - dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, - start_minor_hash)); - dir = dir_file->f_path.dentry->d_inode; - if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { + dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", + start_hash, start_minor_hash)); + dir = file_inode(dir_file); + if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) { hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += + EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + count = htree_inlinedir_to_tree(dir_file, dir, 0, + &hinfo, start_hash, + start_minor_hash, + &has_inline_data); + if (has_inline_data) { + *next_hash = ~0; + return count; + } + } count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash); *next_hash = ~0; @@ -642,7 +988,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, } hinfo.hash = start_hash; hinfo.minor_hash = 0; - frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err); + frame = dx_probe(NULL, dir, &hinfo, frames, &err); if (!frame) return err; @@ -655,7 +1001,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, } if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; - de = ext4_next_entry(de); + de = ext4_next_entry(de, dir->i_sb->s_blocksize); if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0) goto errout; count++; @@ -688,14 +1034,23 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, break; } dx_release(frames); - dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", - count, *next_hash)); + dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, " + "next hash: %x\n", count, *next_hash)); return count; errout: dx_release(frames); return (err); } +static inline int search_dirblock(struct buffer_head *bh, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir) +{ + return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, + d_name, offset, res_dir); +} /* * Directory block splitting, compacting @@ -705,26 +1060,26 @@ errout: * Create map of hash values, offsets, and sizes, stored at end of block. * Returns number of entries mapped. 
*/ -static int dx_make_map (struct ext4_dir_entry_2 *de, int size, - struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) +static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, + struct dx_hash_info *hinfo, + struct dx_map_entry *map_tail) { int count = 0; char *base = (char *) de; struct dx_hash_info h = *hinfo; - while ((char *) de < base + size) - { + while ((char *) de < base + blocksize) { if (de->name_len && de->inode) { ext4fs_dirhash(de->name, de->name_len, &h); map_tail--; map_tail->hash = h.hash; - map_tail->offs = (u16) ((char *) de - base); + map_tail->offs = ((char *) de - base)>>2; map_tail->size = le16_to_cpu(de->rec_len); count++; cond_resched(); } /* XXX: do we need to check rec_len == 0 case? -Chris */ - de = ext4_next_entry(de); + de = ext4_next_entry(de, blocksize); } return count; } @@ -770,13 +1125,6 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) dx_set_count(entries, count + 1); } -static void ext4_update_dx_flag(struct inode *inode) -{ - if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_COMPAT_DIR_INDEX)) - EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL; -} - /* * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. * @@ -796,20 +1144,22 @@ static inline int ext4_match (int len, const char * const name, /* * Returns 0 if not found, -1 on failure, and 1 on success */ -static inline int search_dirblock(struct buffer_head * bh, - struct inode *dir, - struct dentry *dentry, - unsigned long offset, - struct ext4_dir_entry_2 ** res_dir) +int search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; int de_len; - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; + const char *name = d_name->name; + int namelen = d_name->len; - de = (struct ext4_dir_entry_2 *) bh->b_data; - dlimit = bh->b_data + dir->i_sb->s_blocksize; + de = (struct ext4_dir_entry_2 *)search_buf; + dlimit = search_buf + buf_size; while ((char *) de < dlimit) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ @@ -817,14 +1167,15 @@ static inline int search_dirblock(struct buffer_head * bh, if ((char *) de + namelen <= dlimit && ext4_match (namelen, name, de)) { /* found a match - just to be sure, do a full check */ - if (!ext4_check_dir_entry("ext4_find_entry", - dir, de, bh, offset)) + if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, + bh->b_size, offset)) return -1; *res_dir = de; return 1; } /* prevent looping on a bad block */ - de_len = ext4_rec_len_from_disk(de->rec_len); + de_len = ext4_rec_len_from_disk(de->rec_len, + dir->i_sb->s_blocksize); if (de_len <= 0) return -1; offset += de_len; @@ -833,6 +1184,21 @@ static inline int search_dirblock(struct buffer_head * bh, return 0; } +static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, + struct ext4_dir_entry *de) +{ + struct super_block *sb = dir->i_sb; + + if (!is_dx(dir)) + return 0; + if (block == 0) + return 1; + if (de->inode == 0 && + ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) == + sb->s_blocksize) + return 1; + return 0; +} /* * ext4_find_entry() @@ -845,13 +1211,16 @@ static inline int search_dirblock(struct buffer_head * bh, * The returned buffer_head has ->b_count elevated. The caller is expected * to brelse() it when appropriate. 
*/ -static struct buffer_head * ext4_find_entry (struct dentry *dentry, - struct ext4_dir_entry_2 ** res_dir) +static struct buffer_head * ext4_find_entry (struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *inlined) { - struct super_block * sb; - struct buffer_head * bh_use[NAMEI_RA_SIZE]; - struct buffer_head * bh, *ret = NULL; + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; + struct buffer_head *bh, *ret = NULL; ext4_lblk_t start, block, b; + const u8 *name = d_name->name; int ra_max = 0; /* Number of bh's in the readahead buffer, bh_use[] */ int ra_ptr = 0; /* Current index into readahead @@ -859,16 +1228,37 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry, int num = 0; ext4_lblk_t nblocks; int i, err; - struct inode *dir = dentry->d_parent->d_inode; int namelen; *res_dir = NULL; sb = dir->i_sb; - namelen = dentry->d_name.len; + namelen = d_name->len; if (namelen > EXT4_NAME_LEN) return NULL; + + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + ret = ext4_find_inline_entry(dir, d_name, res_dir, + &has_inline_data); + if (has_inline_data) { + if (inlined) + *inlined = 1; + return ret; + } + } + + if ((namelen <= 2) && (name[0] == '.') && + (name[1] == '.' || name[1] == '\0')) { + /* + * "." or ".." will only be in the first block + * NFS may look up ".."; "." should be handled by the VFS + */ + block = start = 0; + nblocks = 1; + goto restart; + } if (is_dx(dir)) { - bh = ext4_dx_find_entry(dentry, res_dir, &err); + bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); /* * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the @@ -876,7 +1266,8 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry, */ if (bh || (err != ERR_BAD_DX_DIR)) return bh; - dxtrace(printk("ext4_find_entry: dx failed, falling back\n")); + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); start = EXT4_I(dir)->i_dir_start_lookup; @@ -906,7 +1297,8 @@ restart: bh = ext4_getblk(NULL, dir, b++, 0, &err); bh_use[ra_max] = bh; if (bh) - ll_rw_block(READ_META, 1, &bh); + ll_rw_block(READ | REQ_META | REQ_PRIO, + 1, &bh); } } if ((bh = bh_use[ra_ptr++]) == NULL) @@ -914,13 +1306,23 @@ restart: wait_on_buffer(bh); if (!buffer_uptodate(bh)) { /* read error, skip block & hope for the best */ - ext4_error(sb, __FUNCTION__, "reading directory #%lu " - "offset %lu", dir->i_ino, - (unsigned long)block); + EXT4_ERROR_INODE(dir, "reading directory lblock %lu", + (unsigned long) block); brelse(bh); goto next; } - i = search_dirblock(bh, dir, dentry, + if (!buffer_verified(bh) && + !is_dx_internal_node(dir, block, + (struct ext4_dir_entry *)bh->b_data) && + !ext4_dirent_csum_verify(dir, + (struct ext4_dir_entry *)bh->b_data)) { + EXT4_ERROR_INODE(dir, "checksumming directory " + "block %lu", (unsigned long)block); + brelse(bh); + goto next; + } + set_buffer_verified(bh); + i = search_dirblock(bh, dir, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (i == 1) { EXT4_I(dir)->i_dir_start_lookup = block; @@ -950,64 +1352,47 @@ restart: cleanup_and_exit: /* Clean up the read-ahead blocks */ for (; ra_ptr < ra_max; ra_ptr++) - brelse (bh_use[ra_ptr]); + brelse(bh_use[ra_ptr]); return ret; } -static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int 
*err) { - struct super_block * sb; + struct super_block * sb = dir->i_sb; struct dx_hash_info hinfo; - u32 hash; struct dx_frame frames[2], *frame; - struct ext4_dir_entry_2 *de, *top; struct buffer_head *bh; ext4_lblk_t block; int retval; - int namelen = dentry->d_name.len; - const u8 *name = dentry->d_name.name; - struct inode *dir = dentry->d_parent->d_inode; - sb = dir->i_sb; - /* NFS may look up ".." - look at dx_root directory block */ - if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ - if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) - return NULL; - } else { - frame = frames; - frame->bh = NULL; /* for dx_release() */ - frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ - dx_set_block(frame->at, 0); /* dx_root block is 0 */ - } - hash = hinfo.hash; + if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) + return NULL; do { block = dx_get_block(frame->at); - if (!(bh = ext4_bread (NULL,dir, block, 0, err))) + bh = ext4_read_dirblock(dir, block, DIRENT); + if (IS_ERR(bh)) { + *err = PTR_ERR(bh); goto errout; - de = (struct ext4_dir_entry_2 *) bh->b_data; - top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - - EXT4_DIR_REC_LEN(0)); - for (; de < top; de = ext4_next_entry(de)) - if (ext4_match (namelen, name, de)) { - if (!ext4_check_dir_entry("ext4_find_entry", - dir, de, bh, - (block<<EXT4_BLOCK_SIZE_BITS(sb)) - +((char *)de - bh->b_data))) { - brelse (bh); - *err = ERR_BAD_DX_DIR; - goto errout; - } - *res_dir = de; - dx_release (frames); + } + retval = search_dirblock(bh, dir, d_name, + block << EXT4_BLOCK_SIZE_BITS(sb), + res_dir); + if (retval == 1) { /* Success! */ + dx_release(frames); return bh; } - brelse (bh); + brelse(bh); + if (retval == -1) { + *err = ERR_BAD_DX_DIR; + goto errout; + } + /* Check to see if we should continue to search */ - retval = ext4_htree_next_block(dir, hash, frame, + retval = ext4_htree_next_block(dir, hinfo.hash, frame, frames, NULL); if (retval < 0) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, "error reading index page in directory #%lu", dir->i_ino); *err = retval; @@ -1017,33 +1402,41 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, *err = -ENOENT; errout: - dxtrace(printk("%s not found\n", name)); + dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); dx_release (frames); return NULL; } -static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct inode * inode; - struct ext4_dir_entry_2 * de; - struct buffer_head * bh; + struct inode *inode; + struct ext4_dir_entry_2 *de; + struct buffer_head *bh; if (dentry->d_name.len > EXT4_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - bh = ext4_find_entry(dentry, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); inode = NULL; if (bh) { - unsigned long ino = le32_to_cpu(de->inode); - brelse (bh); + __u32 ino = le32_to_cpu(de->inode); + brelse(bh); if (!ext4_valid_inum(dir->i_sb, ino)) { - ext4_error(dir->i_sb, "ext4_lookup", - "bad inode number: %lu", ino); + EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); + return ERR_PTR(-EIO); + } + if (unlikely(ino == dir->i_ino)) { + EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir", + dentry); return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); + if (inode == ERR_PTR(-ESTALE)) { + EXT4_ERROR_INODE(dir, + "deleted inode referenced: %u", + ino); + 
return ERR_PTR(-EIO); + } } return d_splice_alias(inode, dentry); } @@ -1051,58 +1444,24 @@ static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, str struct dentry *ext4_get_parent(struct dentry *child) { - unsigned long ino; - struct dentry *parent; - struct inode *inode; - struct dentry dotdot; + __u32 ino; + static const struct qstr dotdot = QSTR_INIT("..", 2); struct ext4_dir_entry_2 * de; struct buffer_head *bh; - dotdot.d_name.name = ".."; - dotdot.d_name.len = 2; - dotdot.d_parent = child; /* confusing, isn't it! */ - - bh = ext4_find_entry(&dotdot, &de); - inode = NULL; + bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); brelse(bh); if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { - ext4_error(child->d_inode->i_sb, "ext4_get_parent", - "bad inode number: %lu", ino); + EXT4_ERROR_INODE(child->d_inode, + "bad parent inode number: %u", ino); return ERR_PTR(-EIO); } - inode = ext4_iget(child->d_inode->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - parent = d_alloc_anon(inode); - if (!parent) { - iput(inode); - parent = ERR_PTR(-ENOMEM); - } - return parent; -} - -#define S_SHIFT 12 -static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, - [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, - [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, -}; - -static inline void ext4_set_de_type(struct super_block *sb, - struct ext4_dir_entry_2 *de, - umode_t mode) { - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) - de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; + return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); } /* @@ -1110,16 +1469,18 @@ static inline void ext4_set_de_type(struct super_block *sb, * Returns pointer to last entry moved. */ static struct ext4_dir_entry_2 * -dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) +dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, + unsigned blocksize) { unsigned rec_len = 0; while (count--) { - struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + map->offs); + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) + (from + (map->offs<<2)); rec_len = EXT4_DIR_REC_LEN(de->name_len); memcpy (to, de, rec_len); ((struct ext4_dir_entry_2 *) to)->rec_len = - ext4_rec_len_to_disk(rec_len); + ext4_rec_len_to_disk(rec_len, blocksize); de->inode = 0; map++; to += rec_len; @@ -1131,19 +1492,19 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) * Compact each dir entry in the range to the minimal rec_len. * Returns pointer to last entry in range. 
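 *
 * A worked example, assuming a 4 KiB block and the usual 4-byte
 * alignment (EXT4_DIR_REC_LEN(n) == 8 + n, rounded up to 4):
 *
 *	before:  "a" rec_len 12 | deleted, rec_len 28 | "bb" rec_len 4056
 *	after:   "a" rec_len 12 | "bb" rec_len 12 | 4072 bytes of slack
 *
 * The caller (do_split) then stretches the final rec_len to the end of
 * the block, minus the checksum tail when metadata_csum is enabled.
 * Note too that the dx_map_entry offsets consumed above are kept in
 * 4-byte units (stored as offs >> 2, read back as offs << 2), which
 * keeps them inside a 16-bit field even for 64 KiB blocks.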
*/ -static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size) +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) { struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; unsigned rec_len = 0; prev = to = de; - while ((char*)de < base + size) { - next = ext4_next_entry(de); + while ((char*)de < base + blocksize) { + next = ext4_next_entry(de, blocksize); if (de->inode && de->name_len) { rec_len = EXT4_DIR_REC_LEN(de->name_len); if (de > to) memmove(to, de, rec_len); - to->rec_len = ext4_rec_len_to_disk(rec_len); + to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); prev = to; to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len); } @@ -1168,15 +1529,22 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, u32 hash2; struct dx_map_entry *map; char *data1 = (*bh)->b_data, *data2; - unsigned split, move, size, i; + unsigned split, move, size; struct ext4_dir_entry_2 *de = NULL, *de2; - int err = 0; + struct ext4_dir_entry_tail *t; + int csum_size = 0; + int err = 0, i; - bh2 = ext4_append (handle, dir, &newblock, &err); - if (!(bh2)) { + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + bh2 = ext4_append(handle, dir, &newblock); + if (IS_ERR(bh2)) { brelse(*bh); *bh = NULL; - goto errout; + *error = PTR_ERR(bh2); + return NULL; } BUFFER_TRACE(*bh, "get_write_access"); @@ -1193,10 +1561,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, /* create map in the end of data2 block */ map = (struct dx_map_entry *) (data2 + blocksize); - count = dx_make_map ((struct ext4_dir_entry_2 *) data1, + count = dx_make_map((struct ext4_dir_entry_2 *) data1, blocksize, hinfo, map); map -= count; - dx_sort_map (map, count); + dx_sort_map(map, count); /* Split the existing block in the middle, size-wise */ size = 0; move = 0; @@ -1216,10 +1584,22 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, hash2, split, count-split)); /* Fancy dance to stay within two buffers */ - de2 = dx_move_dirents(data1, data2, map + split, count - split); - de = dx_pack_dirents(data1,blocksize); - de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); - de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2); + de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); + de = dx_pack_dirents(data1, blocksize); + de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - + (char *) de, + blocksize); + de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) - + (char *) de2, + blocksize); + if (csum_size) { + t = EXT4_DIRENT_TAIL(data2, blocksize); + initialize_dirent_tail(t, blocksize); + + t = EXT4_DIRENT_TAIL(data1, blocksize); + initialize_dirent_tail(t, blocksize); + } + dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); @@ -1229,15 +1609,15 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, swap(*bh, bh2); de = de2; } - dx_insert_block (frame, hash2 + continued, newblock); - err = ext4_journal_dirty_metadata (handle, bh2); + dx_insert_block(frame, hash2 + continued, newblock); + err = ext4_handle_dirty_dirent_node(handle, dir, bh2); if (err) goto journal_error; - err = ext4_journal_dirty_metadata (handle, frame->bh); + err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); 
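/*
 * Both halves of the split must reach the journal: bh2 holds the
 * dirents that were moved out, frame->bh holds the index entry that
 * dx_insert_block() just added for the new block.  With metadata_csum
 * each leaf block now ends in a 12-byte ext4_dir_entry_tail, laid out
 * like an unused dirent (inode 0, rec_len 12) so unaware code simply
 * steps over it; for a 4 KiB block the last real rec_len above was
 * therefore stretched to offset 4084 rather than 4096.
 */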
if (err) goto journal_error; - brelse (bh2); - dxtrace(dx_show_index ("frame", frame->entries)); + brelse(bh2); + dxtrace(dx_show_index("frame", frame->entries)); return de; journal_error: @@ -1245,11 +1625,67 @@ journal_error: brelse(bh2); *bh = NULL; ext4_std_error(dir->i_sb, err); -errout: *error = err; return NULL; } +int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + const char *name, int namelen, + struct ext4_dir_entry_2 **dest_de) +{ + struct ext4_dir_entry_2 *de; + unsigned short reclen = EXT4_DIR_REC_LEN(namelen); + int nlen, rlen; + unsigned int offset = 0; + char *top; + + de = (struct ext4_dir_entry_2 *)buf; + top = buf + buf_size - reclen; + while ((char *) de <= top) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + buf, buf_size, offset)) + return -EIO; + if (ext4_match(namelen, name, de)) + return -EEXIST; + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if ((de->inode ? rlen - nlen : rlen) >= reclen) + break; + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -ENOSPC; + + *dest_de = de; + return 0; +} + +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + const char *name, int namelen) +{ + + int nlen, rlen; + + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if (de->inode) { + struct ext4_dir_entry_2 *de1 = + (struct ext4_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size); + de->rec_len = ext4_rec_len_to_disk(nlen, buf_size); + de = de1; + } + de->file_type = EXT4_FT_UNKNOWN; + de->inode = cpu_to_le32(inode->i_ino); + ext4_set_de_type(inode->i_sb, de, inode->i_mode); + de->name_len = namelen; + memcpy(de->name, name, namelen); +} /* * Add a new entry into a directory (leaf) block. If de is non-NULL, * it points to a directory entry which is guaranteed to be large @@ -1257,71 +1693,39 @@ errout: * add_dirent_to_buf will attempt search the directory block for * space. It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. - * - * NOTE! bh is NOT released in the case where ENOSPC is returned. In - * all other cases bh is released. */ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *inode, struct ext4_dir_entry_2 *de, - struct buffer_head * bh) + struct buffer_head *bh) { struct inode *dir = dentry->d_parent->d_inode; const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; - unsigned long offset = 0; - unsigned short reclen; - int nlen, rlen, err; - char *top; + unsigned int blocksize = dir->i_sb->s_blocksize; + int csum_size = 0; + int err; + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); - reclen = EXT4_DIR_REC_LEN(namelen); if (!de) { - de = (struct ext4_dir_entry_2 *)bh->b_data; - top = bh->b_data + dir->i_sb->s_blocksize - reclen; - while ((char *) de <= top) { - if (!ext4_check_dir_entry("ext4_add_entry", dir, de, - bh, offset)) { - brelse (bh); - return -EIO; - } - if (ext4_match (namelen, name, de)) { - brelse (bh); - return -EEXIST; - } - nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len); - if ((de->inode? 
rlen - nlen: rlen) >= reclen) - break; - de = (struct ext4_dir_entry_2 *)((char *)de + rlen); - offset += rlen; - } - if ((char *) de > top) - return -ENOSPC; + err = ext4_find_dest_de(dir, inode, + bh, bh->b_data, blocksize - csum_size, + name, namelen, &de); + if (err) + return err; } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); if (err) { ext4_std_error(dir->i_sb, err); - brelse(bh); return err; } /* By now the buffer is marked for journaling */ - nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len); - if (de->inode) { - struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); - de1->rec_len = ext4_rec_len_to_disk(rlen - nlen); - de->rec_len = ext4_rec_len_to_disk(nlen); - de = de1; - } - de->file_type = EXT4_FT_UNKNOWN; - if (inode) { - de->inode = cpu_to_le32(inode->i_ino); - ext4_set_de_type(dir->i_sb, de, inode->i_mode); - } else - de->inode = 0; - de->name_len = namelen; - memcpy (de->name, name, namelen); + ext4_insert_dentry(inode, de, blocksize, name, namelen); + /* * XXX shouldn't update any times until successful * completion of syscall, but too many callers depend @@ -1337,11 +1741,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, ext4_update_dx_flag(dir); dir->i_version++; ext4_mark_inode_dirty(handle, dir); - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_dirent_node(handle, dir, bh); if (err) ext4_std_error(dir->i_sb, err); - brelse(bh); return 0; } @@ -1360,6 +1763,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct dx_frame frames[2], *frame; struct dx_entry *entries; struct ext4_dir_entry_2 *de, *de2; + struct ext4_dir_entry_tail *t; char *data1, *top; unsigned len; int retval; @@ -1367,9 +1771,15 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct dx_hash_info hinfo; ext4_lblk_t block; struct fake_dirent *fde; + int csum_size = 0; + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); blocksize = dir->i_sb->s_blocksize; - dxtrace(printk("Creating index\n")); + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); + BUFFER_TRACE(bh, "get_write_access"); retval = ext4_journal_get_write_access(handle, bh); if (retval) { ext4_std_error(dir->i_sb, retval); @@ -1378,38 +1788,56 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, } root = (struct dx_root *) bh->b_data; - bh2 = ext4_append (handle, dir, &block, &retval); - if (!(bh2)) { + /* The 0th block becomes the root, move the dirents out */ + fde = &root->dotdot; + de = (struct ext4_dir_entry_2 *)((char *)fde + + ext4_rec_len_from_disk(fde->rec_len, blocksize)); + if ((char *) de >= (((char *) root) + blocksize)) { + EXT4_ERROR_INODE(dir, "invalid rec_len for '..'"); brelse(bh); - return retval; + return -EIO; } - EXT4_I(dir)->i_flags |= EXT4_INDEX_FL; + len = ((char *) root) + (blocksize - csum_size) - (char *) de; + + /* Allocate new block for the 0th block's dirents */ + bh2 = ext4_append(handle, dir, &block); + if (IS_ERR(bh2)) { + brelse(bh); + return PTR_ERR(bh2); + } + ext4_set_inode_flag(dir, EXT4_INODE_INDEX); data1 = bh2->b_data; - /* The 0th block becomes the root, move the dirents out */ - fde = &root->dotdot; - de = (struct ext4_dir_entry_2 *)((char *)fde + - 
ext4_rec_len_from_disk(fde->rec_len)); - len = ((char *) root) + blocksize - (char *) de; memcpy (data1, de, len); de = (struct ext4_dir_entry_2 *) data1; top = data1 + len; - while ((char *)(de2 = ext4_next_entry(de)) < top) + while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) de = de2; - de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); + de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - + (char *) de, + blocksize); + + if (csum_size) { + t = EXT4_DIRENT_TAIL(data1, blocksize); + initialize_dirent_tail(t, blocksize); + } + /* Initialize the root; the dot dirents already exist */ de = (struct ext4_dir_entry_2 *) (&root->dotdot); - de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2)); + de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), + blocksize); memset (&root->info, 0, sizeof(root->info)); root->info.info_length = sizeof(root->info); root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; entries = root->entries; - dx_set_block (entries, 1); - dx_set_count (entries, 1); - dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); + dx_set_block(entries, 1); + dx_set_count(entries, 1); + dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); /* Initialize as for dx_probe */ hinfo.hash_version = root->info.hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; ext4fs_dirhash(name, namelen, &hinfo); frame = frames; @@ -1417,12 +1845,26 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, frame->at = entries; frame->bh = bh; bh = bh2; + + ext4_handle_dirty_dx_node(handle, dir, frame->bh); + ext4_handle_dirty_dirent_node(handle, dir, bh); + de = do_split(handle,dir, &bh, frame, &hinfo, &retval); - dx_release (frames); - if (!(de)) + if (!de) { + /* + * Even if the block split failed, we have to properly write + * out all the changes we did so far. Otherwise we can end up + * with corrupted filesystem. + */ + ext4_mark_inode_dirty(handle, dir); + dx_release(frames); return retval; + } + dx_release(frames); - return add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + brelse(bh); + return retval; } /* @@ -1435,52 +1877,81 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, * may not sleep between calling this and putting something into * the entry, as someone else might have used it while you slept. 
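 *
 * The insertion path below tries, in order (a rough sketch of the
 * control flow, not every error case):
 *
 *	inline-data dir?  ext4_try_add_inline_entry(), done if it fits
 *	hashed (is_dx)?   ext4_dx_add_entry(); on ERR_BAD_DX_DIR clear
 *	                  EXT4_INODE_INDEX and fall through
 *	each block:       add_dirent_to_buf(), stop unless -ENOSPC
 *	one-block dir?    make_indexed_dir() converts it to an htree
 *	otherwise:        ext4_append() a fresh block and add there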
*/ -static int ext4_add_entry (handle_t *handle, struct dentry *dentry, - struct inode *inode) +static int ext4_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) { struct inode *dir = dentry->d_parent->d_inode; - unsigned long offset; - struct buffer_head * bh; + struct buffer_head *bh; struct ext4_dir_entry_2 *de; - struct super_block * sb; + struct ext4_dir_entry_tail *t; + struct super_block *sb; int retval; int dx_fallback=0; unsigned blocksize; ext4_lblk_t block, blocks; + int csum_size = 0; + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); sb = dir->i_sb; blocksize = sb->s_blocksize; if (!dentry->d_name.len) return -EINVAL; + + if (ext4_has_inline_data(dir)) { + retval = ext4_try_add_inline_entry(handle, dentry, inode); + if (retval < 0) + return retval; + if (retval == 1) { + retval = 0; + return retval; + } + } + if (is_dx(dir)) { retval = ext4_dx_add_entry(handle, dentry, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) return retval; - EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL; + ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); dx_fallback++; ext4_mark_inode_dirty(handle, dir); } blocks = dir->i_size >> sb->s_blocksize_bits; - for (block = 0, offset = 0; block < blocks; block++) { - bh = ext4_bread(handle, dir, block, 0, &retval); - if(!bh) - return retval; + for (block = 0; block < blocks; block++) { + bh = ext4_read_dirblock(dir, block, DIRENT); + if (IS_ERR(bh)) + return PTR_ERR(bh); + retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (retval != -ENOSPC) + if (retval != -ENOSPC) { + brelse(bh); return retval; + } if (blocks == 1 && !dx_fallback && EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) return make_indexed_dir(handle, dentry, inode, bh); brelse(bh); } - bh = ext4_append(handle, dir, &block, &retval); - if (!bh) - return retval; + bh = ext4_append(handle, dir, &block); + if (IS_ERR(bh)) + return PTR_ERR(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; - de->rec_len = ext4_rec_len_to_disk(blocksize); - return add_dirent_to_buf(handle, dentry, inode, de, bh); + de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize); + + if (csum_size) { + t = EXT4_DIRENT_TAIL(bh->b_data, blocksize); + initialize_dirent_tail(t, blocksize); + } + + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + brelse(bh); + if (retval == 0) + ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); + return retval; } /* @@ -1492,20 +1963,23 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct dx_frame frames[2], *frame; struct dx_entry *entries, *at; struct dx_hash_info hinfo; - struct buffer_head * bh; + struct buffer_head *bh; struct inode *dir = dentry->d_parent->d_inode; - struct super_block * sb = dir->i_sb; + struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; int err; - frame = dx_probe(dentry, NULL, &hinfo, frames, &err); + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); if (!frame) return err; entries = frame->entries; at = frame->at; - - if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err))) + bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + bh = NULL; goto cleanup; + } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); @@ -1513,13 +1987,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto journal_error; err = add_dirent_to_buf(handle, 
dentry, inode, NULL, bh); - if (err != -ENOSPC) { - bh = NULL; + if (err != -ENOSPC) goto cleanup; - } /* Block full, should compress but for now just split */ - dxtrace(printk("using %u of %u node entries\n", + dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", dx_get_count(entries), dx_get_limit(entries))); /* Need to split index? */ if (dx_get_count(entries) == dx_get_limit(entries)) { @@ -1532,18 +2004,20 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (levels && (dx_get_count(frames->entries) == dx_get_limit(frames->entries))) { - ext4_warning(sb, __FUNCTION__, - "Directory index full!"); + ext4_warning(sb, "Directory index full!"); err = -ENOSPC; goto cleanup; } - bh2 = ext4_append (handle, dir, &newblock, &err); - if (!(bh2)) + bh2 = ext4_append(handle, dir, &newblock); + if (IS_ERR(bh2)) { + err = PTR_ERR(bh2); goto cleanup; + } node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; - node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize); - node2->fake.inode = 0; + memset(&node2->fake, 0, sizeof(struct fake_dirent)); + node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, + sb->s_blocksize); BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, frame->bh); if (err) @@ -1551,7 +2025,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (levels) { unsigned icount1 = icount/2, icount2 = icount - icount1; unsigned hash2 = dx_get_hash(entries + icount1); - dxtrace(printk("Split index %i/%i\n", icount1, icount2)); + dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", + icount1, icount2)); BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ err = ext4_journal_get_write_access(handle, @@ -1559,11 +2034,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; - memcpy ((char *) entries2, (char *) (entries + icount1), - icount2 * sizeof(struct dx_entry)); - dx_set_count (entries, icount1); - dx_set_count (entries2, icount2); - dx_set_limit (entries2, dx_node_limit(dir)); + memcpy((char *) entries2, (char *) (entries + icount1), + icount2 * sizeof(struct dx_entry)); + dx_set_count(entries, icount1); + dx_set_count(entries2, icount2); + dx_set_limit(entries2, dx_node_limit(dir)); /* Which index block gets the new entry? 
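 *
 * icount1 = icount/2 entries stay put and the rest move to bh2.  With
 * the common 4 KiB block a full node holds (4096 - 8)/8 == 511
 * entries, so icount1 == 255 and icount2 == 256; an insertion point at
 * entries + 300 falls in the moved half, hence the frame/bh2 swap just
 * below.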
*/ if (at - entries >= icount1) { @@ -1571,16 +2046,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, frame->entries = entries = entries2; swap(frame->bh, bh2); } - dx_insert_block (frames + 0, hash2, newblock); - dxtrace(dx_show_index ("node", frames[1].entries)); - dxtrace(dx_show_index ("node", + dx_insert_block(frames + 0, hash2, newblock); + dxtrace(dx_show_index("node", frames[1].entries)); + dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); - err = ext4_journal_dirty_metadata(handle, bh2); + err = ext4_handle_dirty_dx_node(handle, dir, bh2); if (err) goto journal_error; brelse (bh2); } else { - dxtrace(printk("Creating second level index...\n")); + dxtrace(printk(KERN_DEBUG + "Creating second level index...\n")); memcpy((char *) entries2, (char *) entries, icount * sizeof(struct dx_entry)); dx_set_limit(entries2, dx_node_limit(dir)); @@ -1600,63 +2076,111 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - ext4_journal_dirty_metadata(handle, frames[0].bh); + err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); + if (err) { + ext4_std_error(inode->i_sb, err); + goto cleanup; + } } de = do_split(handle, dir, &bh, frame, &hinfo, &err); if (!de) goto cleanup; err = add_dirent_to_buf(handle, dentry, inode, de, bh); - bh = NULL; goto cleanup; journal_error: ext4_std_error(dir->i_sb, err); cleanup: - if (bh) - brelse(bh); + brelse(bh); dx_release(frames); return err; } /* - * ext4_delete_entry deletes a directory entry by merging it with the - * previous entry + * ext4_generic_delete_entry deletes a directory entry by merging it + * with the previous entry */ -static int ext4_delete_entry (handle_t *handle, - struct inode * dir, - struct ext4_dir_entry_2 * de_del, - struct buffer_head * bh) +int ext4_generic_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size) { - struct ext4_dir_entry_2 * de, * pde; + struct ext4_dir_entry_2 *de, *pde; + unsigned int blocksize = dir->i_sb->s_blocksize; int i; i = 0; pde = NULL; - de = (struct ext4_dir_entry_2 *) bh->b_data; - while (i < bh->b_size) { - if (!ext4_check_dir_entry("ext4_delete_entry", dir, de, bh, i)) + de = (struct ext4_dir_entry_2 *)entry_buf; + while (i < buf_size - csum_size) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + bh->b_data, bh->b_size, i)) return -EIO; if (de == de_del) { - BUFFER_TRACE(bh, "get_write_access"); - ext4_journal_get_write_access(handle, bh); if (pde) pde->rec_len = ext4_rec_len_to_disk( - ext4_rec_len_from_disk(pde->rec_len) + - ext4_rec_len_from_disk(de->rec_len)); + ext4_rec_len_from_disk(pde->rec_len, + blocksize) + + ext4_rec_len_from_disk(de->rec_len, + blocksize), + blocksize); else de->inode = 0; dir->i_version++; - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, bh); return 0; } - i += ext4_rec_len_from_disk(de->rec_len); + i += ext4_rec_len_from_disk(de->rec_len, blocksize); pde = de; - de = ext4_next_entry(de); + de = ext4_next_entry(de, blocksize); } return -ENOENT; } +static int ext4_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh) +{ + int err, csum_size = 0; + + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + err = ext4_delete_inline_entry(handle, dir, de_del, bh, + &has_inline_data); + if (has_inline_data) + return err; + } + + if 
(EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) + goto out; + + err = ext4_generic_delete_entry(handle, dir, de_del, + bh, bh->b_data, + dir->i_sb->s_blocksize, csum_size); + if (err) + goto out; + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_dirent_node(handle, dir, bh); + if (unlikely(err)) + goto out; + + return 0; +out: + if (err != -ENOENT) + ext4_std_error(dir->i_sb, err); + return err; +} + /* * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, * since this indicates that nlinks count was previously 1. @@ -1667,7 +2191,7 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode) if (is_dx(inode) && inode->i_nlink > 1) { /* limit is 16-bit i_links_count */ if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) { - inode->i_nlink = 1; + set_nlink(inode, 1); EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_DIR_NLINK); } @@ -1680,9 +2204,8 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode) */ static void ext4_dec_count(handle_t *handle, struct inode *inode) { - drop_nlink(inode); - if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0) - inc_nlink(inode); + if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) + drop_nlink(inode); } @@ -1692,10 +2215,12 @@ static int ext4_add_nondir(handle_t *handle, int err = ext4_add_entry(handle, dentry, inode); if (!err) { ext4_mark_inode_dirty(handle, inode); + unlock_new_inode(inode); d_instantiate(dentry, inode); return 0; } drop_nlink(inode); + unlock_new_inode(inode); iput(inode); return err; } @@ -1708,138 +2233,236 @@ static int ext4_add_nondir(handle_t *handle, * If the create succeeds, we fill in the inode information * with d_instantiate(). 
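 *
 * All the creation paths below share one retry idiom; reduced to a
 * skeleton (names as in the code, error details elided) it reads:
 *
 *	retry:
 *		inode = ext4_new_inode_start_handle(dir, mode, ...);
 *		handle = ext4_journal_current_handle();
 *		err = PTR_ERR(inode);
 *		if (!IS_ERR(inode))
 *			err = ext4_add_nondir(handle, dentry, inode);
 *		if (handle)
 *			ext4_journal_stop(handle);
 *		if (err == -ENOSPC &&
 *		    ext4_should_retry_alloc(dir->i_sb, &retries))
 *			goto retry;
 *
 * ext4_should_retry_alloc() forces a journal commit to return reserved
 * blocks to the allocator and caps the retry count, so the loop is
 * bounded.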
*/ -static int ext4_create (struct inode * dir, struct dentry * dentry, int mode, - struct nameidata *nd) +static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) { handle_t *handle; - struct inode * inode; - int err, retries = 0; - -retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); + struct inode *inode; + int err, credits, retries = 0; - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + dquot_initialize(dir); - inode = ext4_new_inode (handle, dir, mode); + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); +retry: + inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, + NULL, EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); err = ext4_add_nondir(handle, dentry, inode); + if (!err && IS_DIRSYNC(dir)) + ext4_handle_sync(handle); } - ext4_journal_stop(handle); + if (handle) + ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } -static int ext4_mknod (struct inode * dir, struct dentry *dentry, - int mode, dev_t rdev) +static int ext4_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t rdev) { handle_t *handle; struct inode *inode; - int err, retries = 0; + int err, credits, retries = 0; if (!new_valid_dev(rdev)) return -EINVAL; -retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + dquot_initialize(dir); - inode = ext4_new_inode (handle, dir, mode); + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); +retry: + inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, + NULL, EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); -#ifdef CONFIG_EXT4DEV_FS_XATTR inode->i_op = &ext4_special_inode_operations; -#endif err = ext4_add_nondir(handle, dentry, inode); + if (!err && IS_DIRSYNC(dir)) + ext4_handle_sync(handle); } - ext4_journal_stop(handle); + if (handle) + ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } -static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode) +static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { handle_t *handle; - struct inode * inode; - struct buffer_head * dir_block; - struct ext4_dir_entry_2 * de; + struct inode *inode; int err, retries = 0; - if (EXT4_DIR_LINK_MAX(dir)) - return -EMLINK; + dquot_initialize(dir); retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); + inode = ext4_new_inode_start_handle(dir, mode, + NULL, 0, NULL, + EXT4_HT_DIR, + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + 4 + EXT4_XATTR_TRANS_BLOCKS); + handle = ext4_journal_current_handle(); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext4_file_inode_operations; + inode->i_fop = 
&ext4_file_operations; + ext4_set_aops(inode); + d_tmpfile(dentry, inode); + err = ext4_orphan_add(handle, inode); + if (err) + goto err_unlock_inode; + mark_inode_dirty(inode); + unlock_new_inode(inode); + } + if (handle) + ext4_journal_stop(handle); + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; + return err; +err_unlock_inode: + ext4_journal_stop(handle); + unlock_new_inode(inode); + return err; +} - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; +struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len) +{ + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; + de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), + blocksize); + strcpy(de->name, "."); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + + de = ext4_next_entry(de, blocksize); + de->inode = cpu_to_le32(parent_ino); + de->name_len = 2; + if (!dotdot_real_len) + de->rec_len = ext4_rec_len_to_disk(blocksize - + (csum_size + EXT4_DIR_REC_LEN(1)), + blocksize); + else + de->rec_len = ext4_rec_len_to_disk( + EXT4_DIR_REC_LEN(de->name_len), blocksize); + strcpy(de->name, ".."); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + + return ext4_next_entry(de, blocksize); +} + +static int ext4_init_new_dir(handle_t *handle, struct inode *dir, + struct inode *inode) +{ + struct buffer_head *dir_block = NULL; + struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_tail *t; + ext4_lblk_t block = 0; + unsigned int blocksize = dir->i_sb->s_blocksize; + int csum_size = 0; + int err; - inode = ext4_new_inode (handle, dir, S_IFDIR | mode); + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + err = ext4_try_create_inline_dir(handle, dir, inode); + if (err < 0 && err != -ENOSPC) + goto out; + if (!err) + goto out; + } + + inode->i_size = 0; + dir_block = ext4_append(handle, inode, &block); + if (IS_ERR(dir_block)) + return PTR_ERR(dir_block); + BUFFER_TRACE(dir_block, "get_write_access"); + err = ext4_journal_get_write_access(handle, dir_block); + if (err) + goto out; + de = (struct ext4_dir_entry_2 *)dir_block->b_data; + ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); + set_nlink(inode, 2); + if (csum_size) { + t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize); + initialize_dirent_tail(t, blocksize); + } + + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); + if (err) + goto out; + set_buffer_verified(dir_block); +out: + brelse(dir_block); + return err; +} + +static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + handle_t *handle; + struct inode *inode; + int err, credits, retries = 0; + + if (EXT4_DIR_LINK_MAX(dir)) + return -EMLINK; + + dquot_initialize(dir); + + credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); +retry: + inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode, + &dentry->d_name, + 0, NULL, EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; - inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; - dir_block = ext4_bread (handle, inode, 0, 1, &err); - if (!dir_block) + err = 
ext4_init_new_dir(handle, dir, inode); + if (err) goto out_clear_inode; - BUFFER_TRACE(dir_block, "get_write_access"); - ext4_journal_get_write_access(handle, dir_block); - de = (struct ext4_dir_entry_2 *) dir_block->b_data; - de->inode = cpu_to_le32(inode->i_ino); - de->name_len = 1; - de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len)); - strcpy (de->name, "."); - ext4_set_de_type(dir->i_sb, de, S_IFDIR); - de = ext4_next_entry(de); - de->inode = cpu_to_le32(dir->i_ino); - de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize - - EXT4_DIR_REC_LEN(1)); - de->name_len = 2; - strcpy (de->name, ".."); - ext4_set_de_type(dir->i_sb, de, S_IFDIR); - inode->i_nlink = 2; - BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, dir_block); - brelse (dir_block); - ext4_mark_inode_dirty(handle, inode); - err = ext4_add_entry (handle, dentry, inode); + err = ext4_mark_inode_dirty(handle, inode); + if (!err) + err = ext4_add_entry(handle, dentry, inode); if (err) { out_clear_inode: clear_nlink(inode); + unlock_new_inode(inode); ext4_mark_inode_dirty(handle, inode); - iput (inode); + iput(inode); goto out_stop; } ext4_inc_count(handle, dir); ext4_update_dx_flag(dir); - ext4_mark_inode_dirty(handle, dir); + err = ext4_mark_inode_dirty(handle, dir); + if (err) + goto out_clear_inode; + unlock_new_inode(inode); d_instantiate(dentry, inode); + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + out_stop: - ext4_journal_stop(handle); + if (handle) + ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; @@ -1848,141 +2471,161 @@ out_stop: /* * routine to check that the specified directory is empty (for rmdir) */ -static int empty_dir (struct inode * inode) +static int empty_dir(struct inode *inode) { - unsigned long offset; - struct buffer_head * bh; - struct ext4_dir_entry_2 * de, * de1; - struct super_block * sb; + unsigned int offset; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de, *de1; + struct super_block *sb; int err = 0; + if (ext4_has_inline_data(inode)) { + int has_inline_data = 1; + + err = empty_inline_dir(inode, &has_inline_data); + if (has_inline_data) + return err; + } + sb = inode->i_sb; - if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || - !(bh = ext4_bread (NULL, inode, 0, 0, &err))) { - if (err) - ext4_error(inode->i_sb, __FUNCTION__, - "error %d reading directory #%lu offset 0", - err, inode->i_ino); - else - ext4_warning(inode->i_sb, __FUNCTION__, - "bad directory (dir #%lu) - no data block", - inode->i_ino); + if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) { + EXT4_ERROR_INODE(inode, "invalid size"); return 1; } + bh = ext4_read_dirblock(inode, 0, EITHER); + if (IS_ERR(bh)) + return 1; + de = (struct ext4_dir_entry_2 *) bh->b_data; - de1 = ext4_next_entry(de); + de1 = ext4_next_entry(de, sb->s_blocksize); if (le32_to_cpu(de->inode) != inode->i_ino || !le32_to_cpu(de1->inode) || - strcmp (".", de->name) || - strcmp ("..", de1->name)) { - ext4_warning (inode->i_sb, "empty_dir", - "bad directory (dir #%lu) - no `.' or `..'", - inode->i_ino); - brelse (bh); + strcmp(".", de->name) || + strcmp("..", de1->name)) { + ext4_warning(inode->i_sb, + "bad directory (dir #%lu) - no `.' 
or `..'", + inode->i_ino); + brelse(bh); return 1; } - offset = ext4_rec_len_from_disk(de->rec_len) + - ext4_rec_len_from_disk(de1->rec_len); - de = ext4_next_entry(de1); - while (offset < inode->i_size ) { - if (!bh || - (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) + + ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize); + de = ext4_next_entry(de1, sb->s_blocksize); + while (offset < inode->i_size) { + if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + unsigned int lblock; err = 0; - brelse (bh); - bh = ext4_bread (NULL, inode, - offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); - if (!bh) { - if (err) - ext4_error(sb, __FUNCTION__, - "error %d reading directory" - " #%lu offset %lu", - err, inode->i_ino, offset); - offset += sb->s_blocksize; - continue; - } + brelse(bh); + lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); + bh = ext4_read_dirblock(inode, lblock, EITHER); + if (IS_ERR(bh)) + return 1; de = (struct ext4_dir_entry_2 *) bh->b_data; } - if (!ext4_check_dir_entry("empty_dir", inode, de, bh, offset)) { + if (ext4_check_dir_entry(inode, NULL, de, bh, + bh->b_data, bh->b_size, offset)) { de = (struct ext4_dir_entry_2 *)(bh->b_data + sb->s_blocksize); offset = (offset | (sb->s_blocksize - 1)) + 1; continue; } if (le32_to_cpu(de->inode)) { - brelse (bh); + brelse(bh); return 0; } - offset += ext4_rec_len_from_disk(de->rec_len); - de = ext4_next_entry(de); + offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); + de = ext4_next_entry(de, sb->s_blocksize); } - brelse (bh); + brelse(bh); return 1; } -/* ext4_orphan_add() links an unlinked or truncated inode into a list of +/* + * ext4_orphan_add() links an unlinked or truncated inode into a list of * such inodes, starting at the superblock, in case we crash before the * file is closed/deleted, or in case the inode truncate spans multiple * transactions and the last transaction is not recovered after a crash. * * At filesystem recovery time, we walk this list deleting unlinked * inodes and truncating linked inodes in ext4_orphan_cleanup(). + * + * Orphan list manipulation functions must be called under i_mutex unless + * we are just creating the inode or deleting it. */ int ext4_orphan_add(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_iloc iloc; int err = 0, rc; + bool dirty = false; - lock_super(sb); - if (!list_empty(&EXT4_I(inode)->i_orphan)) - goto out_unlock; + if (!sbi->s_journal) + return 0; - /* Orphan handling is only valid for files with data blocks - * being truncated, or files being unlinked. */ + WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + !mutex_is_locked(&inode->i_mutex)); + /* + * Exit early if inode already is on orphan list. This is a big speedup + * since we don't have to contend on the global s_orphan_lock. + */ + if (!list_empty(&EXT4_I(inode)->i_orphan)) + return 0; - /* @@@ FIXME: Observation from aviro: - * I think I can trigger J_ASSERT in ext4_orphan_add(). We block - * here (on lock_super()), so race with ext4_link() which might bump - * ->i_nlink. For, say it, character device. Not a regular file, - * not a directory, not a symlink and ->i_nlink > 0. + /* + * Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. 
Note that we either + * hold i_mutex, or the inode can not be referenced from outside, + * so i_nlink should not be bumped due to race */ - J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); if (err) - goto out_unlock; + goto out; err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) - goto out_unlock; - - /* Insert this inode at the head of the on-disk orphan list... */ - NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); - EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); - err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); - rc = ext4_mark_iloc_dirty(handle, inode, &iloc); - if (!err) - err = rc; - - /* Only add to the head of the in-memory list if all the - * previous operations succeeded. If the orphan_add is going to - * fail (possibly taking the journal offline), we can't risk - * leaving the inode on the orphan list: stray orphan-list - * entries can cause panics at unmount time. - * - * This is safe: on error we're going to ignore the orphan list - * anyway on the next recovery. */ - if (!err) - list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); + goto out; + mutex_lock(&sbi->s_orphan_lock); + /* + * Due to previous errors inode may be already a part of on-disk + * orphan list. If so skip on-disk list modification. + */ + if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) > + (le32_to_cpu(sbi->s_es->s_inodes_count))) { + /* Insert this inode at the head of the on-disk orphan list */ + NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); + sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + dirty = true; + } + list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); + mutex_unlock(&sbi->s_orphan_lock); + + if (dirty) { + err = ext4_handle_dirty_super(handle, sb); + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; + if (err) { + /* + * We have to remove inode from in-memory list if + * addition to on disk orphan list failed. Stray orphan + * list entries can cause panics at unmount time. + */ + mutex_lock(&sbi->s_orphan_lock); + list_del(&EXT4_I(inode)->i_orphan); + mutex_unlock(&sbi->s_orphan_lock); + } + } jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); jbd_debug(4, "orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); -out_unlock: - unlock_super(sb); - ext4_std_error(inode->i_sb, err); +out: + ext4_std_error(sb, err); return err; } @@ -1994,66 +2637,74 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) { struct list_head *prev; struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi; - unsigned long ino_next; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 ino_next; struct ext4_iloc iloc; int err = 0; - lock_super(inode->i_sb); - if (list_empty(&ei->i_orphan)) { - unlock_super(inode->i_sb); + if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) return 0; - } - ino_next = NEXT_ORPHAN(inode); - prev = ei->i_orphan.prev; - sbi = EXT4_SB(inode->i_sb); + WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + !mutex_is_locked(&inode->i_mutex)); + /* Do this quick check before taking global s_orphan_lock. 
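 * It should be safe unlocked: i_mutex (or sole ownership of a new or
 * dying inode) keeps this inode from racing onto or off the list, and
 * only its own list_head is examined.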
*/ + if (list_empty(&ei->i_orphan)) + return 0; + + if (handle) { + /* Grab inode buffer early before taking global s_orphan_lock */ + err = ext4_reserve_inode_write(handle, inode, &iloc); + } + mutex_lock(&sbi->s_orphan_lock); jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + prev = ei->i_orphan.prev; list_del_init(&ei->i_orphan); /* If we're on an error path, we may not have a valid * transaction handle with which to update the orphan list on * disk, but we still need to remove the inode from the linked * list in memory. */ - if (!handle) - goto out; - - err = ext4_reserve_inode_write(handle, inode, &iloc); - if (err) + if (!handle || err) { + mutex_unlock(&sbi->s_orphan_lock); goto out_err; + } + ino_next = NEXT_ORPHAN(inode); if (prev == &sbi->s_orphan) { - jbd_debug(4, "superblock will point to %lu\n", ino_next); + jbd_debug(4, "superblock will point to %u\n", ino_next); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sbi->s_sbh); - if (err) + if (err) { + mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; + } sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); - err = ext4_journal_dirty_metadata(handle, sbi->s_sbh); + mutex_unlock(&sbi->s_orphan_lock); + err = ext4_handle_dirty_super(handle, inode->i_sb); } else { struct ext4_iloc iloc2; struct inode *i_prev = &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; - jbd_debug(4, "orphan inode %lu will point to %lu\n", + jbd_debug(4, "orphan inode %lu will point to %u\n", i_prev->i_ino, ino_next); err = ext4_reserve_inode_write(handle, i_prev, &iloc2); - if (err) + if (err) { + mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; + } NEXT_ORPHAN(i_prev) = ino_next; err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); + mutex_unlock(&sbi->s_orphan_lock); } if (err) goto out_brelse; NEXT_ORPHAN(inode) = 0; err = ext4_mark_iloc_dirty(handle, inode, &iloc); - out_err: ext4_std_error(inode->i_sb, err); -out: - unlock_super(inode->i_sb); return err; out_brelse: @@ -2061,29 +2712,24 @@ out_brelse: goto out_err; } -static int ext4_rmdir (struct inode * dir, struct dentry *dentry) +static int ext4_rmdir(struct inode *dir, struct dentry *dentry) { int retval; - struct inode * inode; - struct buffer_head * bh; - struct ext4_dir_entry_2 * de; - handle_t *handle; + struct inode *inode; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + handle_t *handle = NULL; /* Initialize quotas before so that eventual writes go in * separate transaction */ - DQUOT_INIT(dentry->d_inode); - handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); retval = -ENOENT; - bh = ext4_find_entry (dentry, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (!bh) goto end_rmdir; - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - inode = dentry->d_inode; retval = -EIO; @@ -2091,16 +2737,27 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry) goto end_rmdir; retval = -ENOTEMPTY; - if (!empty_dir (inode)) + if (!empty_dir(inode)) goto end_rmdir; + handle = ext4_journal_start(dir, EXT4_HT_DIR, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + handle = NULL; + goto end_rmdir; + } + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_rmdir; if (!EXT4_DIR_LINK_EMPTY(inode)) - ext4_warning (inode->i_sb, "ext4_rmdir", - "empty directory has too many 
links (%d)", - inode->i_nlink); + ext4_warning(inode->i_sb, + "empty directory has too many links (%d)", + inode->i_nlink); inode->i_version++; clear_nlink(inode); /* There's no need to set i_disksize: the fact that i_nlink is @@ -2115,31 +2772,28 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry) ext4_mark_inode_dirty(handle, dir); end_rmdir: - ext4_journal_stop(handle); - brelse (bh); + brelse(bh); + if (handle) + ext4_journal_stop(handle); return retval; } -static int ext4_unlink(struct inode * dir, struct dentry *dentry) +static int ext4_unlink(struct inode *dir, struct dentry *dentry) { int retval; - struct inode * inode; - struct buffer_head * bh; - struct ext4_dir_entry_2 * de; - handle_t *handle; + struct inode *inode; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + handle_t *handle = NULL; + trace_ext4_unlink_enter(dir, dentry); /* Initialize quotas before so that eventual writes go * in separate transaction */ - DQUOT_INIT(dentry->d_inode); - handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + dquot_initialize(dir); + dquot_initialize(dentry->d_inode); retval = -ENOENT; - bh = ext4_find_entry (dentry, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (!bh) goto end_unlink; @@ -2149,11 +2803,22 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry) if (le32_to_cpu(de->inode) != inode->i_ino) goto end_unlink; + handle = ext4_journal_start(dir, EXT4_HT_DIR, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + handle = NULL; + goto end_unlink; + } + + if (IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + if (!inode->i_nlink) { - ext4_warning (inode->i_sb, "ext4_unlink", - "Deleting nonexistent file (%lu), %d", - inode->i_ino, inode->i_nlink); - inode->i_nlink = 1; + ext4_warning(inode->i_sb, + "Deleting nonexistent file (%lu), %d", + inode->i_ino, inode->i_nlink); + set_nlink(inode, 1); } retval = ext4_delete_entry(handle, dir, de, bh); if (retval) @@ -2169,264 +2834,614 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry) retval = 0; end_unlink: - ext4_journal_stop(handle); - brelse (bh); + brelse(bh); + if (handle) + ext4_journal_stop(handle); + trace_ext4_unlink_exit(dentry, retval); return retval; } -static int ext4_symlink (struct inode * dir, - struct dentry *dentry, const char * symname) +static int ext4_symlink(struct inode *dir, + struct dentry *dentry, const char *symname) { handle_t *handle; - struct inode * inode; + struct inode *inode; int l, err, retries = 0; + int credits; l = strlen(symname)+1; if (l > dir->i_sb->s_blocksize) return -ENAMETOOLONG; -retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + dquot_initialize(dir); - inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); + if (l > EXT4_N_BLOCKS * 4) { + /* + * For non-fast symlinks, we just allocate inode and put it on + * orphan list in the first transaction => we need bitmap, + * group descriptor, sb, inode block, quota blocks, and + * possibly selinux xattr blocks. + */ + credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + EXT4_XATTR_TRANS_BLOCKS; + } else { + /* + * Fast symlink. 
We have to add entry to directory + * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS), + * allocate new inode (bitmap, group descriptor, inode block, + * quota blocks, sb is already counted in previous macros). + */ + credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; + } +retry: + inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, + &dentry->d_name, 0, NULL, + EXT4_HT_DIR, credits); + handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; - if (l > sizeof (EXT4_I(inode)->i_data)) { + if (l > EXT4_N_BLOCKS * 4) { inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); /* - * page_symlink() calls into ext4_prepare/commit_write. - * We have a transaction open. All is sweetness. It also sets - * i_size in generic_commit_write(). + * We cannot call page_symlink() with transaction started + * because it calls into ext4_write_begin() which can wait + * for transaction commit if we are running out of space + * and thus we deadlock. So we have to stop transaction now + * and restart it when symlink contents is written. + * + * To keep fs consistent in case of crash, we have to put inode + * to orphan list in the mean time. + */ + drop_nlink(inode); + err = ext4_orphan_add(handle, inode); + ext4_journal_stop(handle); + if (err) + goto err_drop_inode; + err = __page_symlink(inode, symname, l, 1); + if (err) + goto err_drop_inode; + /* + * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS + * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified */ - err = __page_symlink(inode, symname, l, - mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + handle = ext4_journal_start(dir, EXT4_HT_DIR, + EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto err_drop_inode; + } + set_nlink(inode, 1); + err = ext4_orphan_del(handle, inode); if (err) { + ext4_journal_stop(handle); clear_nlink(inode); - ext4_mark_inode_dirty(handle, inode); - iput (inode); - goto out_stop; + goto err_drop_inode; } } else { + /* clear the extent format for fast symlink */ + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); inode->i_op = &ext4_fast_symlink_inode_operations; - memcpy((char*)&EXT4_I(inode)->i_data,symname,l); + memcpy((char *)&EXT4_I(inode)->i_data, symname, l); inode->i_size = l-1; } EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_add_nondir(handle, dentry, inode); + if (!err && IS_DIRSYNC(dir)) + ext4_handle_sync(handle); + out_stop: - ext4_journal_stop(handle); + if (handle) + ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; +err_drop_inode: + unlock_new_inode(inode); + iput(inode); + return err; } -static int ext4_link (struct dentry * old_dentry, - struct inode * dir, struct dentry *dentry) +static int ext4_link(struct dentry *old_dentry, + struct inode *dir, struct dentry *dentry) { handle_t *handle; struct inode *inode = old_dentry->d_inode; int err, retries = 0; - if (EXT4_DIR_LINK_MAX(inode)) + if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; - /* - * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing - * otherwise has the potential to corrupt the orphan inode list. 
- */ - if (inode->i_nlink == 0) - return -ENOENT; + dquot_initialize(dir); retry: - handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS); + handle = ext4_journal_start(dir, EXT4_HT_DIR, + (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1); if (IS_ERR(handle)) return PTR_ERR(handle); if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode->i_ctime = ext4_current_time(inode); ext4_inc_count(handle, inode); - atomic_inc(&inode->i_count); + ihold(inode); - err = ext4_add_nondir(handle, dentry, inode); + err = ext4_add_entry(handle, dentry, inode); + if (!err) { + ext4_mark_inode_dirty(handle, inode); + /* this can happen only for tmpfile being + * linked the first time + */ + if (inode->i_nlink == 1) + ext4_orphan_del(handle, inode); + d_instantiate(dentry, inode); + } else { + drop_nlink(inode); + iput(inode); + } ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; } -#define PARENT_INO(buffer) \ - (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode) + +/* + * Try to find buffer head where contains the parent block. + * It should be the inode block if it is inlined or the 1st block + * if it is a normal dir. + */ +static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, + struct inode *inode, + int *retval, + struct ext4_dir_entry_2 **parent_de, + int *inlined) +{ + struct buffer_head *bh; + + if (!ext4_has_inline_data(inode)) { + bh = ext4_read_dirblock(inode, 0, EITHER); + if (IS_ERR(bh)) { + *retval = PTR_ERR(bh); + return NULL; + } + *parent_de = ext4_next_entry( + (struct ext4_dir_entry_2 *)bh->b_data, + inode->i_sb->s_blocksize); + return bh; + } + + *inlined = 1; + return ext4_get_first_inline_block(inode, parent_de, retval); +} + +struct ext4_renament { + struct inode *dir; + struct dentry *dentry; + struct inode *inode; + bool is_dir; + int dir_nlink_delta; + + /* entry for "dentry" */ + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + int inlined; + + /* entry for ".." 
in inode if it's a directory */ + struct buffer_head *dir_bh; + struct ext4_dir_entry_2 *parent_de; + int dir_inlined; +}; + +static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent) +{ + int retval; + + ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode, + &retval, &ent->parent_de, + &ent->dir_inlined); + if (!ent->dir_bh) + return retval; + if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino) + return -EIO; + BUFFER_TRACE(ent->dir_bh, "get_write_access"); + return ext4_journal_get_write_access(handle, ent->dir_bh); +} + +static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, + unsigned dir_ino) +{ + int retval; + + ent->parent_de->inode = cpu_to_le32(dir_ino); + BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata"); + if (!ent->dir_inlined) { + if (is_dx(ent->inode)) { + retval = ext4_handle_dirty_dx_node(handle, + ent->inode, + ent->dir_bh); + } else { + retval = ext4_handle_dirty_dirent_node(handle, + ent->inode, + ent->dir_bh); + } + } else { + retval = ext4_mark_inode_dirty(handle, ent->inode); + } + if (retval) { + ext4_std_error(ent->dir->i_sb, retval); + return retval; + } + return 0; +} + +static int ext4_setent(handle_t *handle, struct ext4_renament *ent, + unsigned ino, unsigned file_type) +{ + int retval; + + BUFFER_TRACE(ent->bh, "get write access"); + retval = ext4_journal_get_write_access(handle, ent->bh); + if (retval) + return retval; + ent->de->inode = cpu_to_le32(ino); + if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb, + EXT4_FEATURE_INCOMPAT_FILETYPE)) + ent->de->file_type = file_type; + ent->dir->i_version++; + ent->dir->i_ctime = ent->dir->i_mtime = + ext4_current_time(ent->dir); + ext4_mark_inode_dirty(handle, ent->dir); + BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); + if (!ent->inlined) { + retval = ext4_handle_dirty_dirent_node(handle, + ent->dir, ent->bh); + if (unlikely(retval)) { + ext4_std_error(ent->dir->i_sb, retval); + return retval; + } + } + brelse(ent->bh); + ent->bh = NULL; + + return 0; +} + +static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, + const struct qstr *d_name) +{ + int retval = -ENOENT; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; + + bh = ext4_find_entry(dir, d_name, &de, NULL); + if (bh) { + retval = ext4_delete_entry(handle, dir, de, bh); + brelse(bh); + } + return retval; +} + +static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent) +{ + int retval; + /* + * ent->de could have moved from under us during htree split, so make + * sure that we are deleting the right entry. We might also be pointing + * to a stale entry in the unused part of ent->bh so just checking inum + * and the name isn't enough. 
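[Annotation] Every on-disk mutation in the rename helpers above follows the same jbd2 idiom; a minimal template (assuming an open handle and a pinned buffer_head, as in ext4_setent()):

	BUFFER_TRACE(bh, "get_write_access");
	err = ext4_journal_get_write_access(handle, bh);	/* declare intent */
	if (err)
		return err;
	/* ... modify bh->b_data ... */
	err = ext4_handle_dirty_metadata(handle, NULL, bh);	/* log the change */

Taking write access first gives jbd2 a chance to set up its journaling state for the buffer (including copying out a frozen image if the block is still being committed); marking it dirty afterwards ties the change to the running transaction.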
+ */ + if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino || + ent->de->name_len != ent->dentry->d_name.len || + strncmp(ent->de->name, ent->dentry->d_name.name, + ent->de->name_len)) { + retval = ext4_find_delete_entry(handle, ent->dir, + &ent->dentry->d_name); + } else { + retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh); + if (retval == -ENOENT) { + retval = ext4_find_delete_entry(handle, ent->dir, + &ent->dentry->d_name); + } + } + + if (retval) { + ext4_warning(ent->dir->i_sb, + "Deleting old file (%lu), %d, error=%d", + ent->dir->i_ino, ent->dir->i_nlink, retval); + } +} + +static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) +{ + if (ent->dir_nlink_delta) { + if (ent->dir_nlink_delta == -1) + ext4_dec_count(handle, ent->dir); + else + ext4_inc_count(handle, ent->dir); + ext4_mark_inode_dirty(handle, ent->dir); + } +} /* * Anybody can rename anything with this: the permission checks are left to the * higher-level routines. + * + * n.b. old_{dentry,inode) refers to the source dentry/inode + * while new_{dentry,inode) refers to the destination dentry/inode + * This comes from rename(const char *oldpath, const char *newpath) */ -static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, - struct inode * new_dir,struct dentry *new_dentry) +static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) { - handle_t *handle; - struct inode * old_inode, * new_inode; - struct buffer_head * old_bh, * new_bh, * dir_bh; - struct ext4_dir_entry_2 * old_de, * new_de; + handle_t *handle = NULL; + struct ext4_renament old = { + .dir = old_dir, + .dentry = old_dentry, + .inode = old_dentry->d_inode, + }; + struct ext4_renament new = { + .dir = new_dir, + .dentry = new_dentry, + .inode = new_dentry->d_inode, + }; int retval; - old_bh = new_bh = dir_bh = NULL; + dquot_initialize(old.dir); + dquot_initialize(new.dir); /* Initialize quotas before so that eventual writes go * in separate transaction */ - if (new_dentry->d_inode) - DQUOT_INIT(new_dentry->d_inode); - handle = ext4_journal_start(old_dir, 2 * - EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) - handle->h_sync = 1; + if (new.inode) + dquot_initialize(new.inode); - old_bh = ext4_find_entry (old_dentry, &old_de); + old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process * and merrily kill the link to whatever was created under the * same name. 
Goodbye sticky bit ;-< */ - old_inode = old_dentry->d_inode; retval = -ENOENT; - if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) + if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) goto end_rename; - new_inode = new_dentry->d_inode; - new_bh = ext4_find_entry (new_dentry, &new_de); - if (new_bh) { - if (!new_inode) { - brelse (new_bh); - new_bh = NULL; + new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, + &new.de, &new.inlined); + if (new.bh) { + if (!new.inode) { + brelse(new.bh); + new.bh = NULL; } } - if (S_ISDIR(old_inode->i_mode)) { - if (new_inode) { + if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC)) + ext4_alloc_da_blocks(old.inode); + + handle = ext4_journal_start(old.dir, EXT4_HT_DIR, + (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) + ext4_handle_sync(handle); + + if (S_ISDIR(old.inode->i_mode)) { + if (new.inode) { retval = -ENOTEMPTY; - if (!empty_dir (new_inode)) + if (!empty_dir(new.inode)) + goto end_rename; + } else { + retval = -EMLINK; + if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) goto end_rename; } - retval = -EIO; - dir_bh = ext4_bread (handle, old_inode, 0, 0, &retval); - if (!dir_bh) - goto end_rename; - if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) - goto end_rename; - retval = -EMLINK; - if (!new_inode && new_dir!=old_dir && - new_dir->i_nlink >= EXT4_LINK_MAX) + retval = ext4_rename_dir_prepare(handle, &old); + if (retval) goto end_rename; } - if (!new_bh) { - retval = ext4_add_entry (handle, new_dentry, old_inode); + if (!new.bh) { + retval = ext4_add_entry(handle, new.dentry, old.inode); if (retval) goto end_rename; } else { - BUFFER_TRACE(new_bh, "get write access"); - ext4_journal_get_write_access(handle, new_bh); - new_de->inode = cpu_to_le32(old_inode->i_ino); - if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb, - EXT4_FEATURE_INCOMPAT_FILETYPE)) - new_de->file_type = old_de->file_type; - new_dir->i_version++; - BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, new_bh); - brelse(new_bh); - new_bh = NULL; + retval = ext4_setent(handle, &new, + old.inode->i_ino, old.de->file_type); + if (retval) + goto end_rename; } /* * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = ext4_current_time(old_inode); - ext4_mark_inode_dirty(handle, old_inode); + old.inode->i_ctime = ext4_current_time(old.inode); + ext4_mark_inode_dirty(handle, old.inode); /* * ok, that's it */ - if (le32_to_cpu(old_de->inode) != old_inode->i_ino || - old_de->name_len != old_dentry->d_name.len || - strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || - (retval = ext4_delete_entry(handle, old_dir, - old_de, old_bh)) == -ENOENT) { - /* old_de could have moved from under us during htree split, so - * make sure that we are deleting the right entry. We might - * also be pointing to a stale entry in the unused part of - * old_bh so just checking inum and the name isn't enough. 
*/ - struct buffer_head *old_bh2; - struct ext4_dir_entry_2 *old_de2; - - old_bh2 = ext4_find_entry(old_dentry, &old_de2); - if (old_bh2) { - retval = ext4_delete_entry(handle, old_dir, - old_de2, old_bh2); - brelse(old_bh2); - } + ext4_rename_delete(handle, &old); + + if (new.inode) { + ext4_dec_count(handle, new.inode); + new.inode->i_ctime = ext4_current_time(new.inode); } - if (retval) { - ext4_warning(old_dir->i_sb, "ext4_rename", - "Deleting old file (%lu), %d, error=%d", - old_dir->i_ino, old_dir->i_nlink, retval); - } - - if (new_inode) { - ext4_dec_count(handle, new_inode); - new_inode->i_ctime = ext4_current_time(new_inode); - } - old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); - ext4_update_dx_flag(old_dir); - if (dir_bh) { - BUFFER_TRACE(dir_bh, "get_write_access"); - ext4_journal_get_write_access(handle, dir_bh); - PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); - BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, dir_bh); - ext4_dec_count(handle, old_dir); - if (new_inode) { + old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir); + ext4_update_dx_flag(old.dir); + if (old.dir_bh) { + retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); + if (retval) + goto end_rename; + + ext4_dec_count(handle, old.dir); + if (new.inode) { /* checked empty_dir above, can't have another parent, * ext4_dec_count() won't work for many-linked dirs */ - new_inode->i_nlink = 0; + clear_nlink(new.inode); } else { - ext4_inc_count(handle, new_dir); - ext4_update_dx_flag(new_dir); - ext4_mark_inode_dirty(handle, new_dir); + ext4_inc_count(handle, new.dir); + ext4_update_dx_flag(new.dir); + ext4_mark_inode_dirty(handle, new.dir); } } - ext4_mark_inode_dirty(handle, old_dir); - if (new_inode) { - ext4_mark_inode_dirty(handle, new_inode); - if (!new_inode->i_nlink) - ext4_orphan_add(handle, new_inode); + ext4_mark_inode_dirty(handle, old.dir); + if (new.inode) { + ext4_mark_inode_dirty(handle, new.inode); + if (!new.inode->i_nlink) + ext4_orphan_add(handle, new.inode); } retval = 0; end_rename: - brelse (dir_bh); - brelse (old_bh); - brelse (new_bh); - ext4_journal_stop(handle); + brelse(old.dir_bh); + brelse(old.bh); + brelse(new.bh); + if (handle) + ext4_journal_stop(handle); + return retval; +} + +static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + handle_t *handle = NULL; + struct ext4_renament old = { + .dir = old_dir, + .dentry = old_dentry, + .inode = old_dentry->d_inode, + }; + struct ext4_renament new = { + .dir = new_dir, + .dentry = new_dentry, + .inode = new_dentry->d_inode, + }; + u8 new_file_type; + int retval; + + dquot_initialize(old.dir); + dquot_initialize(new.dir); + + old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, + &old.de, &old.inlined); + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. 
Goodbye sticky bit ;-< + */ + retval = -ENOENT; + if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) + goto end_rename; + + new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, + &new.de, &new.inlined); + + /* RENAME_EXCHANGE case: old *and* new must both exist */ + if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino) + goto end_rename; + + handle = ext4_journal_start(old.dir, EXT4_HT_DIR, + (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + + 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) + ext4_handle_sync(handle); + + if (S_ISDIR(old.inode->i_mode)) { + old.is_dir = true; + retval = ext4_rename_dir_prepare(handle, &old); + if (retval) + goto end_rename; + } + if (S_ISDIR(new.inode->i_mode)) { + new.is_dir = true; + retval = ext4_rename_dir_prepare(handle, &new); + if (retval) + goto end_rename; + } + + /* + * Other than the special case of overwriting a directory, parents' + * nlink only needs to be modified if this is a cross directory rename. + */ + if (old.dir != new.dir && old.is_dir != new.is_dir) { + old.dir_nlink_delta = old.is_dir ? -1 : 1; + new.dir_nlink_delta = -old.dir_nlink_delta; + retval = -EMLINK; + if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) || + (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir))) + goto end_rename; + } + + new_file_type = new.de->file_type; + retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type); + if (retval) + goto end_rename; + + retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type); + if (retval) + goto end_rename; + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old.inode->i_ctime = ext4_current_time(old.inode); + new.inode->i_ctime = ext4_current_time(new.inode); + ext4_mark_inode_dirty(handle, old.inode); + ext4_mark_inode_dirty(handle, new.inode); + + if (old.dir_bh) { + retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); + if (retval) + goto end_rename; + } + if (new.dir_bh) { + retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino); + if (retval) + goto end_rename; + } + ext4_update_dir_count(handle, &old); + ext4_update_dir_count(handle, &new); + retval = 0; + +end_rename: + brelse(old.dir_bh); + brelse(new.dir_bh); + brelse(old.bh); + brelse(new.bh); + if (handle) + ext4_journal_stop(handle); return retval; } +static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) { + return ext4_cross_rename(old_dir, old_dentry, + new_dir, new_dentry); + } + /* + * Existence checking was done by the VFS, otherwise "RENAME_NOREPLACE" + * is equivalent to regular rename. + */ + return ext4_rename(old_dir, old_dentry, new_dir, new_dentry); +} + /* * directories can handle most operations... 
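[Annotation] For reference, the two flags ext4_rename2() accepts map straight onto the renameat2(2) flags. A tiny userspace probe of the RENAME_EXCHANGE path (illustrative: paths "a" and "b" are placeholders, and syscall(2) is used directly since older glibc has no wrapper; requires a kernel new enough to expose __NR_renameat2):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef RENAME_EXCHANGE
	#define RENAME_NOREPLACE	(1 << 0)
	#define RENAME_EXCHANGE		(1 << 1)
	#endif

	int main(void)
	{
		/* atomically swap the two names; both must already exist */
		if (syscall(SYS_renameat2, AT_FDCWD, "a", AT_FDCWD, "b",
			    RENAME_EXCHANGE))
			perror("renameat2");
		return 0;
	}

With RENAME_NOREPLACE the VFS performs the existence check itself, which is why ext4_rename2() can fall through to plain ext4_rename() for that flag.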
*/ @@ -2439,24 +3454,25 @@ const struct inode_operations ext4_dir_inode_operations = { .mkdir = ext4_mkdir, .rmdir = ext4_rmdir, .mknod = ext4_mknod, + .tmpfile = ext4_tmpfile, .rename = ext4_rename, + .rename2 = ext4_rename2, .setattr = ext4_setattr, -#ifdef CONFIG_EXT4DEV_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif - .permission = ext4_permission, + .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, + .fiemap = ext4_fiemap, }; const struct inode_operations ext4_special_inode_operations = { .setattr = ext4_setattr, -#ifdef CONFIG_EXT4DEV_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif - .permission = ext4_permission, + .get_acl = ext4_get_acl, + .set_acl = ext4_set_acl, }; diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h deleted file mode 100644 index 5e4dfff36a0..00000000000 --- a/fs/ext4/namei.h +++ /dev/null @@ -1,8 +0,0 @@ -/* linux/fs/ext4/namei.h - * - * Copyright (C) 2005 Simtec Electronics - * Ben Dooks <ben@simtec.co.uk> - * -*/ - -extern struct dentry *ext4_get_parent(struct dentry *child); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c new file mode 100644 index 00000000000..b24a2541a9b --- /dev/null +++ b/fs/ext4/page-io.c @@ -0,0 +1,497 @@ +/* + * linux/fs/ext4/page-io.c + * + * This contains the new page_io functions for ext4 + * + * Written by Theodore Ts'o, 2010. + */ + +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/jbd2.h> +#include <linux/highuid.h> +#include <linux/pagemap.h> +#include <linux/quotaops.h> +#include <linux/string.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include <linux/mpage.h> +#include <linux/namei.h> +#include <linux/aio.h> +#include <linux/uio.h> +#include <linux/bio.h> +#include <linux/workqueue.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/ratelimit.h> + +#include "ext4_jbd2.h" +#include "xattr.h" +#include "acl.h" + +static struct kmem_cache *io_end_cachep; + +int __init ext4_init_pageio(void) +{ + io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); + if (io_end_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_pageio(void) +{ + kmem_cache_destroy(io_end_cachep); +} + +/* + * Print an buffer I/O error compatible with the fs/buffer.c. This + * provides compatibility with dmesg scrapers that look for a specific + * buffer I/O error message. We really need a unified error reporting + * structure to userspace ala Digital Unix's uerf system, but it's + * probably not going to happen in my lifetime, due to LKML politics... 
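[Annotation] The KMEM_CACHE() one-liner in ext4_init_pageio() above is shorthand; per <linux/slab.h> it expands to a kmem_cache_create() keyed off the type, roughly:

	io_end_cachep = kmem_cache_create("ext4_io_end",
					  sizeof(struct ext4_io_end),
					  __alignof__(struct ext4_io_end),
					  SLAB_RECLAIM_ACCOUNT, NULL);

SLAB_RECLAIM_ACCOUNT tells the VM to count these objects as reclaimable, which is fair since completed io_end structures are freed as soon as extent conversion finishes.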
+ */ +static void buffer_io_error(struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", + bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); +} + +static void ext4_finish_bio(struct bio *bio) +{ + int i; + int error = !test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + struct buffer_head *bh, *head; + unsigned bio_start = bvec->bv_offset; + unsigned bio_end = bio_start + bvec->bv_len; + unsigned under_io = 0; + unsigned long flags; + + if (!page) + continue; + + if (error) { + SetPageError(page); + set_bit(AS_EIO, &page->mapping->flags); + } + bh = head = page_buffers(page); + /* + * We check all buffers in the page under BH_Uptodate_Lock + * to avoid races with other end io clearing async_write flags + */ + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &head->b_state); + do { + if (bh_offset(bh) < bio_start || + bh_offset(bh) + bh->b_size > bio_end) { + if (buffer_async_write(bh)) + under_io++; + continue; + } + clear_buffer_async_write(bh); + if (error) + buffer_io_error(bh); + } while ((bh = bh->b_this_page) != head); + bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); + local_irq_restore(flags); + if (!under_io) + end_page_writeback(page); + } +} + +static void ext4_release_io_end(ext4_io_end_t *io_end) +{ + struct bio *bio, *next_bio; + + BUG_ON(!list_empty(&io_end->list)); + BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); + WARN_ON(io_end->handle); + + if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) + wake_up_all(ext4_ioend_wq(io_end->inode)); + + for (bio = io_end->bio; bio; bio = next_bio) { + next_bio = bio->bi_private; + ext4_finish_bio(bio); + bio_put(bio); + } + kmem_cache_free(io_end_cachep, io_end); +} + +static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) +{ + struct inode *inode = io_end->inode; + + io_end->flag &= ~EXT4_IO_END_UNWRITTEN; + /* Wake up anyone waiting on unwritten extent conversion */ + if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) + wake_up_all(ext4_ioend_wq(inode)); +} + +/* + * Check a range of space and convert unwritten extents to written. Note that + * we are protected from truncate touching same part of extent tree by the + * fact that truncate code waits for all DIO to finish (thus exclusion from + * direct IO is achieved) and also waits for PageWriteback bits. Thus we + * cannot get to ext4_ext_truncate() before all IOs overlapping that range are + * completed (happens from ext4_free_ioend()). + */ +static int ext4_end_io(ext4_io_end_t *io) +{ + struct inode *inode = io->inode; + loff_t offset = io->offset; + ssize_t size = io->size; + handle_t *handle = io->handle; + int ret = 0; + + ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," + "list->prev 0x%p\n", + io, inode->i_ino, io->list.next, io->list.prev); + + io->handle = NULL; /* Following call will use up the handle */ + ret = ext4_convert_unwritten_extents(handle, inode, offset, size); + if (ret < 0) { + ext4_msg(inode->i_sb, KERN_EMERG, + "failed to convert unwritten extents to written " + "extents -- potential data loss! 
" + "(inode %lu, offset %llu, size %zd, error %d)", + inode->i_ino, offset, size, ret); + } + ext4_clear_io_unwritten_flag(io); + ext4_release_io_end(io); + return ret; +} + +static void dump_completed_IO(struct inode *inode, struct list_head *head) +{ +#ifdef EXT4FS_DEBUG + struct list_head *cur, *before, *after; + ext4_io_end_t *io, *io0, *io1; + + if (list_empty(head)) + return; + + ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); + list_for_each_entry(io, head, list) { + cur = &io->list; + before = cur->prev; + io0 = container_of(before, ext4_io_end_t, list); + after = cur->next; + io1 = container_of(after, ext4_io_end_t, list); + + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", + io, inode->i_ino, io0, io1); + } +#endif +} + +/* Add the io_end to per-inode completed end_io list. */ +static void ext4_add_complete_io(ext4_io_end_t *io_end) +{ + struct ext4_inode_info *ei = EXT4_I(io_end->inode); + struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb); + struct workqueue_struct *wq; + unsigned long flags; + + /* Only reserved conversions from writeback should enter here */ + WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + WARN_ON(!io_end->handle && sbi->s_journal); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + wq = sbi->rsv_conversion_wq; + if (list_empty(&ei->i_rsv_conversion_list)) + queue_work(wq, &ei->i_rsv_conversion_work); + list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); +} + +static int ext4_do_flush_completed_IO(struct inode *inode, + struct list_head *head) +{ + ext4_io_end_t *io; + struct list_head unwritten; + unsigned long flags; + struct ext4_inode_info *ei = EXT4_I(inode); + int err, ret = 0; + + spin_lock_irqsave(&ei->i_completed_io_lock, flags); + dump_completed_IO(inode, head); + list_replace_init(head, &unwritten); + spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); + + while (!list_empty(&unwritten)) { + io = list_entry(unwritten.next, ext4_io_end_t, list); + BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); + list_del_init(&io->list); + + err = ext4_end_io(io); + if (unlikely(!ret && err)) + ret = err; + } + return ret; +} + +/* + * work on completed IO, to convert unwritten extents to extents + */ +void ext4_end_io_rsv_work(struct work_struct *work) +{ + struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, + i_rsv_conversion_work); + ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list); +} + +ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) +{ + ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); + if (io) { + atomic_inc(&EXT4_I(inode)->i_ioend_count); + io->inode = inode; + INIT_LIST_HEAD(&io->list); + atomic_set(&io->count, 1); + } + return io; +} + +void ext4_put_io_end_defer(ext4_io_end_t *io_end) +{ + if (atomic_dec_and_test(&io_end->count)) { + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { + ext4_release_io_end(io_end); + return; + } + ext4_add_complete_io(io_end); + } +} + +int ext4_put_io_end(ext4_io_end_t *io_end) +{ + int err = 0; + + if (atomic_dec_and_test(&io_end->count)) { + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + err = ext4_convert_unwritten_extents(io_end->handle, + io_end->inode, io_end->offset, + io_end->size); + io_end->handle = NULL; + ext4_clear_io_unwritten_flag(io_end); + } + ext4_release_io_end(io_end); + } + return err; +} + +ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) +{ + atomic_inc(&io_end->count); + return io_end; +} + +/* BIO 
completion function for page writeback */ +static void ext4_end_bio(struct bio *bio, int error) +{ + ext4_io_end_t *io_end = bio->bi_private; + sector_t bi_sector = bio->bi_iter.bi_sector; + + BUG_ON(!io_end); + bio->bi_end_io = NULL; + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = 0; + + if (error) { + struct inode *inode = io_end->inode; + + ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " + "(offset %llu size %ld starting block %llu)", + error, inode->i_ino, + (unsigned long long) io_end->offset, + (long) io_end->size, + (unsigned long long) + bi_sector >> (inode->i_blkbits - 9)); + mapping_set_error(inode->i_mapping, error); + } + + if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + /* + * Link bio into list hanging from io_end. We have to do it + * atomically as bio completions can be racing against each + * other. + */ + bio->bi_private = xchg(&io_end->bio, bio); + ext4_put_io_end_defer(io_end); + } else { + /* + * Drop io_end reference early. Inode can get freed once + * we finish the bio. + */ + ext4_put_io_end_defer(io_end); + ext4_finish_bio(bio); + bio_put(bio); + } +} + +void ext4_io_submit(struct ext4_io_submit *io) +{ + struct bio *bio = io->io_bio; + + if (bio) { + bio_get(io->io_bio); + submit_bio(io->io_op, io->io_bio); + BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); + bio_put(io->io_bio); + } + io->io_bio = NULL; +} + +void ext4_io_submit_init(struct ext4_io_submit *io, + struct writeback_control *wbc) +{ + io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + io->io_bio = NULL; + io->io_end = NULL; +} + +static int io_submit_init_bio(struct ext4_io_submit *io, + struct buffer_head *bh) +{ + int nvecs = bio_get_nr_vecs(bh->b_bdev); + struct bio *bio; + + bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); + if (!bio) + return -ENOMEM; + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + bio->bi_end_io = ext4_end_bio; + bio->bi_private = ext4_get_io_end(io->io_end); + io->io_bio = bio; + io->io_next_block = bh->b_blocknr; + return 0; +} + +static int io_submit_add_bh(struct ext4_io_submit *io, + struct inode *inode, + struct buffer_head *bh) +{ + int ret; + + if (io->io_bio && bh->b_blocknr != io->io_next_block) { +submit_and_retry: + ext4_io_submit(io); + } + if (io->io_bio == NULL) { + ret = io_submit_init_bio(io, bh); + if (ret) + return ret; + } + ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); + if (ret != bh->b_size) + goto submit_and_retry; + io->io_next_block++; + return 0; +} + +int ext4_bio_write_page(struct ext4_io_submit *io, + struct page *page, + int len, + struct writeback_control *wbc, + bool keep_towrite) +{ + struct inode *inode = page->mapping->host; + unsigned block_start, blocksize; + struct buffer_head *bh, *head; + int ret = 0; + int nr_submitted = 0; + + blocksize = 1 << inode->i_blkbits; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + + if (keep_towrite) + set_page_writeback_keepwrite(page); + else + set_page_writeback(page); + ClearPageError(page); + + /* + * Comments copied from block_write_full_page: + * + * The page straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." 
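[Annotation] A worked illustration of the EOF-straddling logic below (numbers are illustrative only): for a 5120-byte file with 4096-byte pages and 1024-byte blocks, writing page index 1 gives len = 1024, so:

	zero_user_segment(page, 1024, 4096);	/* zero the mmap-visible tail */
	/* bh 0 (offset 0):      dirty+mapped -> set_buffer_async_write()  */
	/* bh 1-3 (1024..3072):  past len     -> clear_buffer_dirty(),     */
	/*                                      set_buffer_uptodate() only */

The two-pass structure matters: every buffer that will be written is flagged async_write before the first bio is submitted, so an early I/O completion cannot call end_page_writeback() while later buffers are still being queued.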
+ */ + if (len < PAGE_CACHE_SIZE) + zero_user_segment(page, len, PAGE_CACHE_SIZE); + /* + * In the first loop we prepare and mark buffers to submit. We have to + * mark all buffers in the page before submitting so that + * end_page_writeback() cannot be called from ext4_bio_end_io() when IO + * on the first buffer finishes and we are still working on submitting + * the second buffer. + */ + bh = head = page_buffers(page); + do { + block_start = bh_offset(bh); + if (block_start >= len) { + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + continue; + } + if (!buffer_dirty(bh) || buffer_delay(bh) || + !buffer_mapped(bh) || buffer_unwritten(bh)) { + /* A hole? We can safely clear the dirty bit */ + if (!buffer_mapped(bh)) + clear_buffer_dirty(bh); + if (io->io_bio) + ext4_io_submit(io); + continue; + } + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + } + set_buffer_async_write(bh); + } while ((bh = bh->b_this_page) != head); + + /* Now submit buffers to write */ + bh = head = page_buffers(page); + do { + if (!buffer_async_write(bh)) + continue; + ret = io_submit_add_bh(io, inode, bh); + if (ret) { + /* + * We only get here on ENOMEM. Not much else + * we can do but mark the page as dirty, and + * better luck next time. + */ + redirty_page_for_writepage(wbc, page); + break; + } + nr_submitted++; + clear_buffer_dirty(bh); + } while ((bh = bh->b_this_page) != head); + + /* Error stopped previous loop? Clean up buffers... */ + if (ret) { + do { + clear_buffer_async_write(bh); + bh = bh->b_this_page; + } while (bh != head); + } + unlock_page(page); + /* Nothing submitted - we have to end page writeback */ + if (!nr_submitted) + end_page_writeback(page); + return ret; +} diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index e29efa0f9d6..bb0e80f03e2 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -11,12 +11,61 @@ #define EXT4FS_DEBUG -#include <linux/ext4_jbd2.h> - #include <linux/errno.h> #include <linux/slab.h> -#include "group.h" +#include "ext4_jbd2.h" + +int ext4_resize_begin(struct super_block *sb) +{ + int ret = 0; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + /* + * We are not allowed to do online-resizing on a filesystem mounted + * with error, because it can destroy the filesystem easily. 
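[Annotation] The resize entry/exit points being added here implement a one-bit mutex; the idiom stripped to its core (as in ext4_resize_begin()/ext4_resize_end() below, with EXT4_RESIZING the bit number in sbi->s_resize_flags):

	if (test_and_set_bit_lock(EXT4_RESIZING, &sbi->s_resize_flags))
		return -EBUSY;			/* a resize is already running */
	/* ... grow the filesystem ... */
	clear_bit_unlock(EXT4_RESIZING, &sbi->s_resize_flags);

The _lock/_unlock bitop variants carry acquire/release ordering, so stores made during the resize cannot be reordered past the releasing clear.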
+ */ + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + ext4_warning(sb, "There are errors in the filesystem, " + "so online resizing is not allowed\n"); + return -EPERM; + } + + if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags)) + ret = -EBUSY; + + return ret; +} + +void ext4_resize_end(struct super_block *sb) +{ + clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags); + smp_mb__after_atomic(); +} + +static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb, + ext4_group_t group) { + return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) << + EXT4_DESC_PER_BLOCK_BITS(sb); +} + +static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb, + ext4_group_t group) { + group = ext4_meta_bg_first_group(sb, group); + return ext4_group_first_block_no(sb, group); +} + +static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb, + ext4_group_t group) { + ext4_grpblk_t overhead; + overhead = ext4_bg_num_gdb(sb, group); + if (ext4_bg_has_super(sb, group)) + overhead += 1 + + le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); + return overhead; +} #define outside(b, first, last) ((b) < (first) || (b) >= (last)) #define inside(b, first, last) ((b) >= (first) && (b) < (last)) @@ -30,14 +79,20 @@ static int verify_group_input(struct super_block *sb, ext4_fsblk_t end = start + input->blocks_count; ext4_group_t group = input->group; ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; - unsigned overhead = ext4_bg_has_super(sb, group) ? - (1 + ext4_bg_num_gdb(sb, group) + - le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; - ext4_fsblk_t metaend = start + overhead; + unsigned overhead; + ext4_fsblk_t metaend; struct buffer_head *bh = NULL; ext4_grpblk_t free_blocks_count, offset; int err = -EINVAL; + if (group != sbi->s_groups_count) { + ext4_warning(sb, "Cannot add at group %u (only %u groups)", + input->group, sbi->s_groups_count); + return -EINVAL; + } + + overhead = ext4_group_overhead_blocks(sb, group); + metaend = start + overhead; input->free_blocks_count = free_blocks_count = input->blocks_count - 2 - overhead - sbi->s_itb_per_group; @@ -49,66 +104,52 @@ static int verify_group_input(struct super_block *sb, free_blocks_count, input->reserved_blocks); ext4_get_group_no_and_offset(sb, start, NULL, &offset); - if (group != sbi->s_groups_count) - ext4_warning(sb, __FUNCTION__, - "Cannot add at group %u (only %lu groups)", - input->group, sbi->s_groups_count); - else if (offset != 0) - ext4_warning(sb, __FUNCTION__, "Last group not full"); + if (offset != 0) + ext4_warning(sb, "Last group not full"); else if (input->reserved_blocks > input->blocks_count / 5) - ext4_warning(sb, __FUNCTION__, "Reserved blocks too high (%u)", + ext4_warning(sb, "Reserved blocks too high (%u)", input->reserved_blocks); else if (free_blocks_count < 0) - ext4_warning(sb, __FUNCTION__, "Bad blocks count %u", + ext4_warning(sb, "Bad blocks count %u", input->blocks_count); else if (!(bh = sb_bread(sb, end - 1))) - ext4_warning(sb, __FUNCTION__, - "Cannot read last block (%llu)", + ext4_warning(sb, "Cannot read last block (%llu)", end - 1); else if (outside(input->block_bitmap, start, end)) - ext4_warning(sb, __FUNCTION__, - "Block bitmap not in group (block %llu)", + ext4_warning(sb, "Block bitmap not in group (block %llu)", (unsigned long long)input->block_bitmap); else if (outside(input->inode_bitmap, start, end)) - ext4_warning(sb, __FUNCTION__, - "Inode bitmap not in group (block %llu)", + ext4_warning(sb, "Inode bitmap not in group (block %llu)", (unsigned 
long long)input->inode_bitmap); else if (outside(input->inode_table, start, end) || - outside(itend - 1, start, end)) - ext4_warning(sb, __FUNCTION__, - "Inode table not in group (blocks %llu-%llu)", + outside(itend - 1, start, end)) + ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)", (unsigned long long)input->inode_table, itend - 1); else if (input->inode_bitmap == input->block_bitmap) - ext4_warning(sb, __FUNCTION__, - "Block bitmap same as inode bitmap (%llu)", + ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)", (unsigned long long)input->block_bitmap); else if (inside(input->block_bitmap, input->inode_table, itend)) - ext4_warning(sb, __FUNCTION__, - "Block bitmap (%llu) in inode table (%llu-%llu)", + ext4_warning(sb, "Block bitmap (%llu) in inode table " + "(%llu-%llu)", (unsigned long long)input->block_bitmap, (unsigned long long)input->inode_table, itend - 1); else if (inside(input->inode_bitmap, input->inode_table, itend)) - ext4_warning(sb, __FUNCTION__, - "Inode bitmap (%llu) in inode table (%llu-%llu)", + ext4_warning(sb, "Inode bitmap (%llu) in inode table " + "(%llu-%llu)", (unsigned long long)input->inode_bitmap, (unsigned long long)input->inode_table, itend - 1); else if (inside(input->block_bitmap, start, metaend)) - ext4_warning(sb, __FUNCTION__, - "Block bitmap (%llu) in GDT table" - " (%llu-%llu)", + ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)", (unsigned long long)input->block_bitmap, start, metaend - 1); else if (inside(input->inode_bitmap, start, metaend)) - ext4_warning(sb, __FUNCTION__, - "Inode bitmap (%llu) in GDT table" - " (%llu-%llu)", + ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)", (unsigned long long)input->inode_bitmap, start, metaend - 1); else if (inside(input->inode_table, start, metaend) || - inside(itend - 1, start, metaend)) - ext4_warning(sb, __FUNCTION__, - "Inode table (%llu-%llu) overlaps" - "GDT table (%llu-%llu)", + inside(itend - 1, start, metaend)) + ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table " + "(%llu-%llu)", (unsigned long long)input->inode_table, itend - 1, start, metaend - 1); else @@ -118,6 +159,186 @@ static int verify_group_input(struct super_block *sb, return err; } +/* + * ext4_new_flex_group_data is used by 64bit-resize interface to add a flex + * group each time. + */ +struct ext4_new_flex_group_data { + struct ext4_new_group_data *groups; /* new_group_data for groups + in the flex group */ + __u16 *bg_flags; /* block group flags of groups + in @groups */ + ext4_group_t count; /* number of groups in @groups + */ +}; + +/* + * alloc_flex_gd() allocates a ext4_new_flex_group_data with size of + * @flexbg_size. + * + * Returns NULL on failure otherwise address of the allocated structure. 
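[Annotation] One nit in the allocator that follows: its overflow guard divides UINT_MAX by sizeof(struct ext4_new_flex_group_data), yet the multiplications being guarded use sizeof(struct ext4_new_group_data) and sizeof(__u16). A tighter guard would use the element size actually multiplied (a suggested hardening, not part of this patch):

	if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_group_data))
		goto out2;	/* kmalloc size below would overflow */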
+ */ +static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size) +{ + struct ext4_new_flex_group_data *flex_gd; + + flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS); + if (flex_gd == NULL) + goto out3; + + if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data)) + goto out2; + flex_gd->count = flexbg_size; + + flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) * + flexbg_size, GFP_NOFS); + if (flex_gd->groups == NULL) + goto out2; + + flex_gd->bg_flags = kmalloc(flexbg_size * sizeof(__u16), GFP_NOFS); + if (flex_gd->bg_flags == NULL) + goto out1; + + return flex_gd; + +out1: + kfree(flex_gd->groups); +out2: + kfree(flex_gd); +out3: + return NULL; +} + +static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd) +{ + kfree(flex_gd->bg_flags); + kfree(flex_gd->groups); + kfree(flex_gd); +} + +/* + * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps + * and inode tables for a flex group. + * + * This function is used by 64bit-resize. Note that this function allocates + * group tables from the 1st group of groups contained by @flexgd, which may + * be a partial of a flex group. + * + * @sb: super block of fs to which the groups belongs + * + * Returns 0 on a successful allocation of the metadata blocks in the + * block group. + */ +static int ext4_alloc_group_tables(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd, + int flexbg_size) +{ + struct ext4_new_group_data *group_data = flex_gd->groups; + ext4_fsblk_t start_blk; + ext4_fsblk_t last_blk; + ext4_group_t src_group; + ext4_group_t bb_index = 0; + ext4_group_t ib_index = 0; + ext4_group_t it_index = 0; + ext4_group_t group; + ext4_group_t last_group; + unsigned overhead; + __u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0; + + BUG_ON(flex_gd->count == 0 || group_data == NULL); + + src_group = group_data[0].group; + last_group = src_group + flex_gd->count - 1; + + BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) != + (last_group & ~(flexbg_size - 1)))); +next_group: + group = group_data[0].group; + if (src_group >= group_data[0].group + flex_gd->count) + return -ENOSPC; + start_blk = ext4_group_first_block_no(sb, src_group); + last_blk = start_blk + group_data[src_group - group].blocks_count; + + overhead = ext4_group_overhead_blocks(sb, src_group); + + start_blk += overhead; + + /* We collect contiguous blocks as much as possible. 
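[Annotation] A worked picture of what the allocation loops below produce (illustrative: flexbg_size = 4, with only the first group of the flex carrying super/GDT overhead): the metadata of all four groups is packed contiguously at the front of the flex group,

	bb0 bb1 bb2 bb3 | ib0 ib1 ib2 ib3 | it0[] it1[] it2[] it3[] | data...

where each bbN/ibN is a one-block block/inode bitmap and each itN[] spans s_itb_per_group inode-table blocks. Every metadata block landing in group N decrements that group's free_blocks_count and clears its EXT4_BG_BLOCK_UNINIT flag, since that group's bitmap is no longer all-zero.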
*/ + src_group++; + for (; src_group <= last_group; src_group++) { + overhead = ext4_group_overhead_blocks(sb, src_group); + if (overhead == 0) + last_blk += group_data[src_group - group].blocks_count; + else + break; + } + + /* Allocate block bitmaps */ + for (; bb_index < flex_gd->count; bb_index++) { + if (start_blk >= last_blk) + goto next_group; + group_data[bb_index].block_bitmap = start_blk++; + group = ext4_get_group_number(sb, start_blk - 1); + group -= group_data[0].group; + group_data[group].free_blocks_count--; + flex_gd->bg_flags[group] &= uninit_mask; + } + + /* Allocate inode bitmaps */ + for (; ib_index < flex_gd->count; ib_index++) { + if (start_blk >= last_blk) + goto next_group; + group_data[ib_index].inode_bitmap = start_blk++; + group = ext4_get_group_number(sb, start_blk - 1); + group -= group_data[0].group; + group_data[group].free_blocks_count--; + flex_gd->bg_flags[group] &= uninit_mask; + } + + /* Allocate inode tables */ + for (; it_index < flex_gd->count; it_index++) { + unsigned int itb = EXT4_SB(sb)->s_itb_per_group; + ext4_fsblk_t next_group_start; + + if (start_blk + itb > last_blk) + goto next_group; + group_data[it_index].inode_table = start_blk; + group = ext4_get_group_number(sb, start_blk); + next_group_start = ext4_group_first_block_no(sb, group + 1); + group -= group_data[0].group; + + if (start_blk + itb > next_group_start) { + flex_gd->bg_flags[group + 1] &= uninit_mask; + overhead = start_blk + itb - next_group_start; + group_data[group + 1].free_blocks_count -= overhead; + itb -= overhead; + } + + group_data[group].free_blocks_count -= itb; + flex_gd->bg_flags[group] &= uninit_mask; + start_blk += EXT4_SB(sb)->s_itb_per_group; + } + + if (test_opt(sb, DEBUG)) { + int i; + group = group_data[0].group; + + printk(KERN_DEBUG "EXT4-fs: adding a flex group with " + "%d groups, flexbg size is %d:\n", flex_gd->count, + flexbg_size); + + for (i = 0; i < flex_gd->count; i++) { + printk(KERN_DEBUG "adding %s group %u: %u " + "blocks (%d free)\n", + ext4_bg_has_super(sb, group + i) ? "normal" : + "no-super", group + i, + group_data[i].blocks_count, + group_data[i].free_blocks_count); + } + } + return 0; +} + static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, ext4_fsblk_t blk) { @@ -125,16 +346,15 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, int err; bh = sb_getblk(sb, blk); - if (!bh) - return ERR_PTR(-EIO); + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); + BUFFER_TRACE(bh, "get_write_access"); if ((err = ext4_journal_get_write_access(handle, bh))) { brelse(bh); bh = ERR_PTR(err); } else { - lock_buffer(bh); memset(bh->b_data, 0, sb->s_blocksize); set_buffer_uptodate(bh); - unlock_buffer(bh); } return bh; @@ -145,168 +365,288 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, * If that fails, restart the transaction & regain write access for the * buffer head which is used for block_bitmap modifications. 
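[Annotation] The helper below leans on the jbd2 convention for extending a handle, which is worth spelling out:

	/* ext4_journal_extend(handle, nblocks) return values:
	 *   0  - running transaction had room; credits were added
	 *  >0  - no room; caller must ext4_journal_restart() the handle
	 *  <0  - errno (e.g. the handle was already aborted)
	 */

After a restart the handle belongs to a new transaction, so any buffer modified earlier must go through ext4_journal_get_write_access() again before being touched; the resize loops below re-take access per block for exactly this reason.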
*/ -static int extend_or_restart_transaction(handle_t *handle, int thresh, - struct buffer_head *bh) +static int extend_or_restart_transaction(handle_t *handle, int thresh) { int err; - if (handle->h_buffer_credits >= thresh) + if (ext4_handle_has_enough_credits(handle, thresh)) return 0; err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA); if (err < 0) return err; if (err) { - if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) + err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA); + if (err) + return err; + } + + return 0; +} + +/* + * set_flexbg_block_bitmap() mark @count blocks starting from @block used. + * + * Helper function for ext4_setup_new_group_blocks() which set . + * + * @sb: super block + * @handle: journal handle + * @flex_gd: flex group data + */ +static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, + struct ext4_new_flex_group_data *flex_gd, + ext4_fsblk_t block, ext4_group_t count) +{ + ext4_group_t count2; + + ext4_debug("mark blocks [%llu/%u] used\n", block, count); + for (count2 = count; count > 0; count -= count2, block += count2) { + ext4_fsblk_t start; + struct buffer_head *bh; + ext4_group_t group; + int err; + + group = ext4_get_group_number(sb, block); + start = ext4_group_first_block_no(sb, group); + group -= flex_gd->groups[0].group; + + count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start); + if (count2 > count) + count2 = count; + + if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) { + BUG_ON(flex_gd->count > 1); + continue; + } + + err = extend_or_restart_transaction(handle, 1); + if (err) + return err; + + bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap); + if (unlikely(!bh)) + return -ENOMEM; + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) return err; - if ((err = ext4_journal_get_write_access(handle, bh))) + ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block, + block - start, count2); + ext4_set_bits(bh->b_data, block - start, count2); + + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) return err; - } + brelse(bh); + } return 0; } /* - * Set up the block and inode bitmaps, and the inode table for the new group. + * Set up the block and inode bitmaps, and the inode table for the new groups. * This doesn't need to be part of the main transaction, since we are only * changing blocks outside the actual filesystem. We still do journaling to * ensure the recovery is correct in case of a failure just after resize. * If any part of this fails, we simply abort the resize. + * + * setup_new_flex_group_blocks handles a flex group as follow: + * 1. copy super block and GDT, and initialize group tables if necessary. + * In this step, we only set bits in blocks bitmaps for blocks taken by + * super block and GDT. + * 2. allocate group tables in block bitmaps, that is, set bits in block + * bitmap for blocks taken by group tables. */ -static int setup_new_group_blocks(struct super_block *sb, - struct ext4_new_group_data *input) +static int setup_new_flex_group_blocks(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) { + int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group}; + ext4_fsblk_t start; + ext4_fsblk_t block; struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group); - int reserved_gdb = ext4_bg_has_super(sb, input->group) ? 
- le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; - unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group); - struct buffer_head *bh; + struct ext4_super_block *es = sbi->s_es; + struct ext4_new_group_data *group_data = flex_gd->groups; + __u16 *bg_flags = flex_gd->bg_flags; handle_t *handle; - ext4_fsblk_t block; - ext4_grpblk_t bit; - int i; - int err = 0, err2; + ext4_group_t group, count; + struct buffer_head *bh = NULL; + int reserved_gdb, i, j, err = 0, err2; + int meta_bg; - /* This transaction may be extended/restarted along the way */ - handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); + BUG_ON(!flex_gd->count || !group_data || + group_data[0].group != sbi->s_groups_count); + + reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); + meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); + /* This transaction may be extended/restarted along the way */ + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA); if (IS_ERR(handle)) return PTR_ERR(handle); - lock_super(sb); - if (input->group != sbi->s_groups_count) { - err = -EBUSY; - goto exit_journal; - } - - if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { - err = PTR_ERR(bh); - goto exit_journal; - } - - if (ext4_bg_has_super(sb, input->group)) { - ext4_debug("mark backup superblock %#04llx (+0)\n", start); - ext4_set_bit(0, bh->b_data); - } + group = group_data[0].group; + for (i = 0; i < flex_gd->count; i++, group++) { + unsigned long gdblocks; + ext4_grpblk_t overhead; - /* Copy all of the GDT blocks into the backup in this group */ - for (i = 0, bit = 1, block = start + 1; - i < gdblocks; i++, block++, bit++) { - struct buffer_head *gdb; + gdblocks = ext4_bg_num_gdb(sb, group); + start = ext4_group_first_block_no(sb, group); - ext4_debug("update backup group %#04llx (+%d)\n", block, bit); + if (meta_bg == 0 && !ext4_bg_has_super(sb, group)) + goto handle_itb; - if ((err = extend_or_restart_transaction(handle, 1, bh))) - goto exit_bh; - - gdb = sb_getblk(sb, block); - if (!gdb) { - err = -EIO; - goto exit_bh; + if (meta_bg == 1) { + ext4_group_t first_group; + first_group = ext4_meta_bg_first_group(sb, group); + if (first_group != group + 1 && + first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1) + goto handle_itb; } - if ((err = ext4_journal_get_write_access(handle, gdb))) { + + block = start + ext4_bg_has_super(sb, group); + /* Copy all of the GDT blocks into the backup in this group */ + for (j = 0; j < gdblocks; j++, block++) { + struct buffer_head *gdb; + + ext4_debug("update backup group %#04llx\n", block); + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + + gdb = sb_getblk(sb, block); + if (unlikely(!gdb)) { + err = -ENOMEM; + goto out; + } + + BUFFER_TRACE(gdb, "get_write_access"); + err = ext4_journal_get_write_access(handle, gdb); + if (err) { + brelse(gdb); + goto out; + } + memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data, + gdb->b_size); + set_buffer_uptodate(gdb); + + err = ext4_handle_dirty_metadata(handle, NULL, gdb); + if (unlikely(err)) { + brelse(gdb); + goto out; + } brelse(gdb); - goto exit_bh; } - lock_buffer(gdb); - memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); - set_buffer_uptodate(gdb); - unlock_buffer(gdb); - ext4_journal_dirty_metadata(handle, gdb); - ext4_set_bit(bit, bh->b_data); - brelse(gdb); - } - /* Zero out all of the reserved backup group descriptor table blocks */ - for (i = 0, bit = gdblocks + 1, block = start + bit; - i < reserved_gdb; i++, block++, bit++) { - struct buffer_head *gdb; - - 
ext4_debug("clear reserved block %#04llx (+%d)\n", block, bit); - - if ((err = extend_or_restart_transaction(handle, 1, bh))) - goto exit_bh; + /* Zero out all of the reserved backup group descriptor + * table blocks + */ + if (ext4_bg_has_super(sb, group)) { + err = sb_issue_zeroout(sb, gdblocks + start + 1, + reserved_gdb, GFP_NOFS); + if (err) + goto out; + } - if (IS_ERR(gdb = bclean(handle, sb, block))) { +handle_itb: + /* Initialize group tables of the grop @group */ + if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) + goto handle_bb; + + /* Zero out all of the inode table blocks */ + block = group_data[i].inode_table; + ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", + block, sbi->s_itb_per_group); + err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, + GFP_NOFS); + if (err) + goto out; + +handle_bb: + if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT) + goto handle_ib; + + /* Initialize block bitmap of the @group */ + block = group_data[i].block_bitmap; + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + + bh = bclean(handle, sb, block); + if (IS_ERR(bh)) { err = PTR_ERR(bh); - goto exit_bh; + goto out; } - ext4_journal_dirty_metadata(handle, gdb); - ext4_set_bit(bit, bh->b_data); - brelse(gdb); - } - ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, - input->block_bitmap - start); - ext4_set_bit(input->block_bitmap - start, bh->b_data); - ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap, - input->inode_bitmap - start); - ext4_set_bit(input->inode_bitmap - start, bh->b_data); - - /* Zero out all of the inode table blocks */ - for (i = 0, block = input->inode_table, bit = block - start; - i < sbi->s_itb_per_group; i++, bit++, block++) { - struct buffer_head *it; - - ext4_debug("clear inode block %#04llx (+%d)\n", block, bit); - - if ((err = extend_or_restart_transaction(handle, 1, bh))) - goto exit_bh; - - if (IS_ERR(it = bclean(handle, sb, block))) { - err = PTR_ERR(it); - goto exit_bh; + overhead = ext4_group_overhead_blocks(sb, group); + if (overhead != 0) { + ext4_debug("mark backup superblock %#04llx (+0)\n", + start); + ext4_set_bits(bh->b_data, 0, overhead); } - ext4_journal_dirty_metadata(handle, it); - brelse(it); - ext4_set_bit(bit, bh->b_data); - } + ext4_mark_bitmap_end(group_data[i].blocks_count, + sb->s_blocksize * 8, bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out; + brelse(bh); - if ((err = extend_or_restart_transaction(handle, 2, bh))) - goto exit_bh; +handle_ib: + if (bg_flags[i] & EXT4_BG_INODE_UNINIT) + continue; + + /* Initialize inode bitmap of the @group */ + block = group_data[i].inode_bitmap; + err = extend_or_restart_transaction(handle, 1); + if (err) + goto out; + /* Mark unused entries in inode bitmap used */ + bh = bclean(handle, sb, block); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + goto out; + } - mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb), - bh->b_data); - ext4_journal_dirty_metadata(handle, bh); - brelse(bh); + ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), + sb->s_blocksize * 8, bh->b_data); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) + goto out; + brelse(bh); + } + bh = NULL; + + /* Mark group tables in block bitmap */ + for (j = 0; j < GROUP_TABLE_COUNT; j++) { + count = group_table_count[j]; + start = (&group_data[0].block_bitmap)[j]; + block = start; + for (i = 1; i < flex_gd->count; i++) { + block += group_table_count[j]; + if (block == (&group_data[i].block_bitmap)[j]) { + count += 
group_table_count[j]; + continue; + } + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, start, count); + if (err) + goto out; + count = group_table_count[j]; + start = (&group_data[i].block_bitmap)[j]; + block = start; + } - /* Mark unused entries in inode bitmap used */ - ext4_debug("clear inode bitmap %#04llx (+%llu)\n", - input->inode_bitmap, input->inode_bitmap - start); - if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { - err = PTR_ERR(bh); - goto exit_journal; + if (count) { + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, start, count); + if (err) + goto out; + } } - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), - bh->b_data); - ext4_journal_dirty_metadata(handle, bh); -exit_bh: +out: brelse(bh); - -exit_journal: - unlock_super(sb); - if ((err2 = ext4_journal_stop(handle)) && !err) + err2 = ext4_journal_stop(handle); + if (err2 && !err) err = err2; return err; @@ -354,10 +694,10 @@ static unsigned ext4_list_backups(struct super_block *sb, unsigned *three, * groups in current filesystem that have BACKUPS, or -ve error code. */ static int verify_reserved_gdb(struct super_block *sb, + ext4_group_t end, struct buffer_head *primary) { const ext4_fsblk_t blk = primary->b_blocknr; - const ext4_group_t end = EXT4_SB(sb)->s_groups_count; unsigned three = 1; unsigned five = 5; unsigned seven = 7; @@ -368,8 +708,7 @@ static int verify_reserved_gdb(struct super_block *sb, while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) { if (le32_to_cpu(*p++) != grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){ - ext4_warning(sb, __FUNCTION__, - "reserved GDT %llu" + ext4_warning(sb, "reserved GDT %llu" " missing grp %d (%llu)", blk, grp, grp * @@ -398,15 +737,15 @@ static int verify_reserved_gdb(struct super_block *sb, * fail once we start modifying the data on disk, because JBD has no rollback. */ static int add_new_gdb(handle_t *handle, struct inode *inode, - struct ext4_new_group_data *input, - struct buffer_head **primary) + ext4_group_t group) { struct super_block *sb = inode->i_sb; struct ext4_super_block *es = EXT4_SB(sb)->s_es; - unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); + unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; struct buffer_head **o_group_desc, **n_group_desc; struct buffer_head *dind; + struct buffer_head *gdb_bh; int gdbackups; struct ext4_iloc iloc; __le32 *data; @@ -419,22 +758,22 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, /* * If we are not using the primary superblock/GDT copy don't resize, - * because the user tools have no way of handling this. Probably a - * bad time to do it anyways. - */ + * because the user tools have no way of handling this. Probably a + * bad time to do it anyways. 
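[Annotation] The block arithmetic at the top of add_new_gdb() is easier to check with concrete numbers (illustrative: 4 KiB blocks and 32-byte descriptors, so EXT4_DESC_PER_BLOCK(sb) == 128):

	group   = 4096;				/* first group described by the new GDT block */
	gdb_num = 4096 / 128;			/* = 32 */
	gdblock = s_sbh->b_blocknr + 1 + 32;	/* right behind sb + the 32 live GDT blocks */

That is, primary GDT block n always sits n+1 blocks after the primary superblock, and it is only promoted from the reserved pool once group n * EXT4_DESC_PER_BLOCK(sb) is about to exist.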
+ */ if (EXT4_SB(sb)->s_sbh->b_blocknr != le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { - ext4_warning(sb, __FUNCTION__, - "won't resize using backup superblock at %llu", + ext4_warning(sb, "won't resize using backup superblock at %llu", (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); return -EPERM; } - *primary = sb_bread(sb, gdblock); - if (!*primary) + gdb_bh = sb_bread(sb, gdblock); + if (!gdb_bh) return -EIO; - if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { + gdbackups = verify_reserved_gdb(sb, group, gdb_bh); + if (gdbackups < 0) { err = gdbackups; goto exit_bh; } @@ -448,32 +787,39 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, data = (__le32 *)dind->b_data; if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { - ext4_warning(sb, __FUNCTION__, - "new group %u GDT block %llu not reserved", - input->group, gdblock); + ext4_warning(sb, "new group %u GDT block %llu not reserved", + group, gdblock); err = -EINVAL; goto exit_dind; } - if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (unlikely(err)) goto exit_dind; - if ((err = ext4_journal_get_write_access(handle, *primary))) - goto exit_sbh; + BUFFER_TRACE(gdb_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gdb_bh); + if (unlikely(err)) + goto exit_dind; - if ((err = ext4_journal_get_write_access(handle, dind))) - goto exit_primary; + BUFFER_TRACE(dind, "get_write_access"); + err = ext4_journal_get_write_access(handle, dind); + if (unlikely(err)) + ext4_std_error(sb, err); /* ext4_reserve_inode_write() gets a reference on the iloc */ - if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) - goto exit_dindj; + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (unlikely(err)) + goto exit_dind; - n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), - GFP_KERNEL); + n_group_desc = ext4_kvmalloc((gdb_num + 1) * + sizeof(struct buffer_head *), + GFP_NOFS); if (!n_group_desc) { err = -ENOMEM; - ext4_warning (sb, __FUNCTION__, - "not enough memory for %lu groups", gdb_num + 1); + ext4_warning(sb, "not enough memory for %lu groups", + gdb_num + 1); goto exit_inode; } @@ -487,46 +833,89 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, * reserved inode, and will become GDT blocks (primary and backup). 
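For the non-meta_bg layout handled by add_new_gdb(), the group descriptor table sits directly behind the primary superblock, so descriptor block gdb_num lives at fs block s_sbh->b_blocknr + 1 + gdb_num. A toy computation, assuming 4 KiB blocks with 64-byte descriptors and the superblock in block 0 (values made up, not read from a real filesystem):

#include <stdio.h>

#define BLOCK_SIZE      4096
#define DESC_SIZE       64
#define DESC_PER_BLOCK  (BLOCK_SIZE / DESC_SIZE)   /* 64 groups per GDT block */

int main(void)
{
        unsigned long long sb_block = 0;   /* primary sb block for 4 KiB fs */
        unsigned int group = 100;          /* some newly added group */
        unsigned long gdb_num = group / DESC_PER_BLOCK;
        unsigned long long gdblock = sb_block + 1 + gdb_num;

        printf("group %u -> GDT block %lu -> fs block %llu\n",
               group, gdb_num, gdblock);
        return 0;
}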
*/ data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; - ext4_journal_dirty_metadata(handle, dind); - brelse(dind); + err = ext4_handle_dirty_metadata(handle, NULL, dind); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_inode; + } inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; ext4_mark_iloc_dirty(handle, inode, &iloc); - memset((*primary)->b_data, 0, sb->s_blocksize); - ext4_journal_dirty_metadata(handle, *primary); + memset(gdb_bh->b_data, 0, sb->s_blocksize); + err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); + if (unlikely(err)) { + ext4_std_error(sb, err); + goto exit_inode; + } + brelse(dind); o_group_desc = EXT4_SB(sb)->s_group_desc; memcpy(n_group_desc, o_group_desc, EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); - n_group_desc[gdb_num] = *primary; + n_group_desc[gdb_num] = gdb_bh; EXT4_SB(sb)->s_group_desc = n_group_desc; EXT4_SB(sb)->s_gdb_count++; - kfree(o_group_desc); + ext4_kvfree(o_group_desc); - es->s_reserved_gdt_blocks = - cpu_to_le16(le16_to_cpu(es->s_reserved_gdt_blocks) - 1); - ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); + le16_add_cpu(&es->s_reserved_gdt_blocks, -1); + err = ext4_handle_dirty_super(handle, sb); + if (err) + ext4_std_error(sb, err); - return 0; + return err; exit_inode: - //ext4_journal_release_buffer(handle, iloc.bh); + ext4_kvfree(n_group_desc); brelse(iloc.bh); -exit_dindj: - //ext4_journal_release_buffer(handle, dind); -exit_primary: - //ext4_journal_release_buffer(handle, *primary); -exit_sbh: - //ext4_journal_release_buffer(handle, *primary); exit_dind: brelse(dind); exit_bh: - brelse(*primary); + brelse(gdb_bh); ext4_debug("leaving with error %d\n", err); return err; } /* + * add_new_gdb_meta_bg is the sister of add_new_gdb. + */ +static int add_new_gdb_meta_bg(struct super_block *sb, + handle_t *handle, ext4_group_t group) { + ext4_fsblk_t gdblock; + struct buffer_head *gdb_bh; + struct buffer_head **o_group_desc, **n_group_desc; + unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + int err; + + gdblock = ext4_meta_bg_first_block_no(sb, group) + + ext4_bg_has_super(sb, group); + gdb_bh = sb_bread(sb, gdblock); + if (!gdb_bh) + return -EIO; + n_group_desc = ext4_kvmalloc((gdb_num + 1) * + sizeof(struct buffer_head *), + GFP_NOFS); + if (!n_group_desc) { + err = -ENOMEM; + ext4_warning(sb, "not enough memory for %lu groups", + gdb_num + 1); + return err; + } + + o_group_desc = EXT4_SB(sb)->s_group_desc; + memcpy(n_group_desc, o_group_desc, + EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); + n_group_desc[gdb_num] = gdb_bh; + EXT4_SB(sb)->s_group_desc = n_group_desc; + EXT4_SB(sb)->s_gdb_count++; + ext4_kvfree(o_group_desc); + BUFFER_TRACE(gdb_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gdb_bh); + if (unlikely(err)) + brelse(gdb_bh); + return err; +} + +/* * Called when we are adding a new group which has a backup copy of each of * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. * We need to add these reserved backup GDT blocks to the resize inode, so @@ -540,7 +929,7 @@ exit_bh: * backup GDT blocks are stored in their reserved primary GDT block. 
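add_new_gdb_meta_bg() above relies on the META_BG placement rule: groups are clustered into meta groups of EXT4_DESC_PER_BLOCK(sb) groups, and each meta group keeps its single descriptor block at the start of its first group, right behind that group's backup superblock if it has one. A rough sketch under assumed geometry (toy constants, s_first_data_block taken as 0):

#include <stdio.h>

#define BLOCKS_PER_GROUP 32768  /* toy value: 4 KiB blocks */
#define DESC_PER_BLOCK   64     /* toy value: 64-byte descriptors */

/* First group of the meta group that contains @group */
static unsigned int meta_bg_first_group(unsigned int group)
{
        return group & ~(DESC_PER_BLOCK - 1);
}

/* First block of that group, assuming s_first_data_block == 0 */
static unsigned long long meta_bg_first_block_no(unsigned int group)
{
        return (unsigned long long)meta_bg_first_group(group) * BLOCKS_PER_GROUP;
}

int main(void)
{
        unsigned int group = 130;

        printf("group %u: meta group starts at group %u (block %llu)\n",
               group, meta_bg_first_group(group),
               meta_bg_first_block_no(group));
        return 0;
}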
*/ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, - struct ext4_new_group_data *input) + ext4_group_t group) { struct super_block *sb = inode->i_sb; int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); @@ -553,7 +942,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, int res, i; int err; - primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_KERNEL); + primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS); if (!primary) return -ENOMEM; @@ -565,14 +954,14 @@ } blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count; - data = (__le32 *)dind->b_data + EXT4_SB(sb)->s_gdb_count; + data = (__le32 *)dind->b_data + (EXT4_SB(sb)->s_gdb_count % + EXT4_ADDR_PER_BLOCK(sb)); end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb); /* Get each reserved primary GDT block and verify it holds backups */ for (res = 0; res < reserved_gdb; res++, blk++) { if (le32_to_cpu(*data) != blk) { - ext4_warning(sb, __FUNCTION__, - "reserved block %llu" + ext4_warning(sb, "reserved block %llu" " not at offset %ld", blk, (long)(data - (__le32 *)dind->b_data)); @@ -584,7 +973,8 @@ err = -EIO; goto exit_bh; } - if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) { + gdbackups = verify_reserved_gdb(sb, group, primary[res]); + if (gdbackups < 0) { brelse(primary[res]); err = gdbackups; goto exit_bh; @@ -594,14 +984,9 @@ } for (i = 0; i < reserved_gdb; i++) { - if ((err = ext4_journal_get_write_access(handle, primary[i]))) { - /* - int j; - for (j = 0; j < i; j++) - ext4_journal_release_buffer(handle, primary[j]); - */ + BUFFER_TRACE(primary[i], "get_write_access"); + if ((err = ext4_journal_get_write_access(handle, primary[i]))) goto exit_bh; - } } if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) @@ -611,7 +996,7 @@ * Finally we can add each of the reserved backup GDT blocks from * the new group to its reserved primary GDT block. */ - blk = input->group * EXT4_BLOCKS_PER_GROUP(sb); + blk = group * EXT4_BLOCKS_PER_GROUP(sb); for (i = 0; i < reserved_gdb; i++) { int err2; data = (__le32 *)primary[i]->b_data; @@ -619,7 +1004,7 @@ primary[i]->b_blocknr, gdbackups, blk + primary[i]->b_blocknr); */ data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); - err2 = ext4_journal_dirty_metadata(handle, primary[i]); + err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]); if (!err) err = err2; } @@ -646,49 +1031,68 @@ exit_free: * important part is that the new block and inode counts are in the backup * superblocks, and the location of the new group metadata in the GDT backups. * - * We do not need lock_super() for this, because these blocks are not - * otherwise touched by the filesystem code when it is mounted. We don't - * need to worry about last changing from sbi->s_groups_count, because the - * worst that can happen is that we do not copy the full number of backups - * at this time. The resize which changed s_groups_count will backup again. + * We do not need to take the s_resize_lock for this, because these + * blocks are not otherwise touched by the filesystem code when it is + * mounted. 
We don't need to worry about last changing from + * sbi->s_groups_count, because the worst that can happen is that we + * do not copy the full number of backups at this time. The resize + * which changed s_groups_count will backup again. */ -static void update_backups(struct super_block *sb, - int blk_off, char *data, int size) +static void update_backups(struct super_block *sb, int blk_off, char *data, + int size, int meta_bg) { struct ext4_sb_info *sbi = EXT4_SB(sb); - const ext4_group_t last = sbi->s_groups_count; + ext4_group_t last; const int bpg = EXT4_BLOCKS_PER_GROUP(sb); unsigned three = 1; unsigned five = 5; unsigned seven = 7; - ext4_group_t group; + ext4_group_t group = 0; int rest = sb->s_blocksize - size; handle_t *handle; int err = 0, err2; - handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA); if (IS_ERR(handle)) { group = 1; err = PTR_ERR(handle); goto exit_err; } - while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) { + if (meta_bg == 0) { + group = ext4_list_backups(sb, &three, &five, &seven); + last = sbi->s_groups_count; + } else { + group = ext4_meta_bg_first_group(sb, group) + 1; + last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2); + } + + while (group < sbi->s_groups_count) { struct buffer_head *bh; + ext4_fsblk_t backup_block; /* Out of journal space, and can't get more - abort - so sad */ - if (handle->h_buffer_credits == 0 && + if (ext4_handle_valid(handle) && + handle->h_buffer_credits == 0 && ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) && (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) break; - bh = sb_getblk(sb, group * bpg + blk_off); - if (!bh) { - err = -EIO; + if (meta_bg == 0) + backup_block = group * bpg + blk_off; + else + backup_block = (ext4_group_first_block_no(sb, group) + + ext4_bg_has_super(sb, group)); + + bh = sb_getblk(sb, backup_block); + if (unlikely(!bh)) { + err = -ENOMEM; break; } - ext4_debug("update metadata backup %#04lx\n", - (unsigned long)bh->b_blocknr); + ext4_debug("update metadata backup %llu(+%llu)\n", + backup_block, backup_block - + ext4_group_first_block_no(sb, group)); + BUFFER_TRACE(bh, "get_write_access"); if ((err = ext4_journal_get_write_access(handle, bh))) break; lock_buffer(bh); @@ -697,8 +1101,17 @@ static void update_backups(struct super_block *sb, memset(bh->b_data + size, 0, rest); set_buffer_uptodate(bh); unlock_buffer(bh); - ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (unlikely(err)) + ext4_std_error(sb, err); brelse(bh); + + if (meta_bg == 0) + group = ext4_list_backups(sb, &three, &five, &seven); + else if (group == last) + break; + else + group = last; } if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; @@ -715,8 +1128,7 @@ static void update_backups(struct super_block *sb, */ exit_err: if (err) { - ext4_warning(sb, __FUNCTION__, - "can't update backup for group %lu (err %d), " + ext4_warning(sb, "can't update backup for group %u (err %d), " "forcing fsck on next reboot", group, err); sbi->s_mount_state &= ~EXT4_VALID_FS; sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); @@ -724,147 +1136,184 @@ exit_err: } } -/* Add group descriptor data to an existing or new group descriptor block. - * Ensure we handle all possible error conditions _before_ we start modifying - * the filesystem, because we cannot abort the transaction and not have it - * write the data to disk. 
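The two backup_block formulas in the update_backups() loop above reflect the two layouts: with sparse_super the copied block sits at a fixed offset blk_off inside every backup group, while with meta_bg it sits just behind the group's backup superblock. A toy comparison, with all numbers assumed rather than taken from a real filesystem:

#include <stdio.h>

#define BPG 32768       /* toy blocks-per-group */

int main(void)
{
        unsigned long long blk_off = 2; /* e.g. the second GDT block */
        unsigned int group = 9;
        int has_super = 1;              /* group carries a backup sb */

        unsigned long long sparse = (unsigned long long)group * BPG + blk_off;
        unsigned long long metabg = (unsigned long long)group * BPG + has_super;

        printf("sparse_super backup at %llu, meta_bg backup at %llu\n",
               sparse, metabg);
        return 0;
}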
- * - * If we are on a GDT block boundary, we need to get the reserved GDT block. - * Otherwise, we may need to add backup GDT blocks for a sparse group. +/* + * ext4_add_new_descs() adds @count group descriptors of groups + * starting at @group * - * We only need to hold the superblock lock while we are actually adding - * in the new group's counts to the superblock. Prior to that we have - * not really "added" the group at all. We re-check that we are still - * adding in the last group in case things have changed since verifying. + * @handle: journal handle + * @sb: super block + * @group: the group no. of the first group desc to be added + * @resize_inode: the resize inode + * @count: number of group descriptors to be added */ -int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) +static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, + ext4_group_t group, struct inode *resize_inode, + ext4_group_t count) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; - int reserved_gdb = ext4_bg_has_super(sb, input->group) ? - le16_to_cpu(es->s_reserved_gdt_blocks) : 0; - struct buffer_head *primary = NULL; - struct ext4_group_desc *gdp; - struct inode *inode = NULL; - handle_t *handle; - int gdb_off, gdb_num; - int err, err2; - - gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); - gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); - - if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { - ext4_warning(sb, __FUNCTION__, - "Can't resize non-sparse filesystem further"); - return -EPERM; - } - - if (ext4_blocks_count(es) + input->blocks_count < - ext4_blocks_count(es)) { - ext4_warning(sb, __FUNCTION__, "blocks_count overflow\n"); - return -EINVAL; - } - - if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < - le32_to_cpu(es->s_inodes_count)) { - ext4_warning(sb, __FUNCTION__, "inodes_count overflow\n"); - return -EINVAL; + struct buffer_head *gdb_bh; + int i, gdb_off, gdb_num, err = 0; + int meta_bg; + + meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); + for (i = 0; i < count; i++, group++) { + int reserved_gdb = ext4_bg_has_super(sb, group) ? + le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + + gdb_off = group % EXT4_DESC_PER_BLOCK(sb); + gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + + /* + * We will only either add reserved group blocks to a backup group + * or remove reserved blocks for the first group in a new group block. + * Doing both would mean more complex code, and sane people don't + * use non-sparse filesystems anymore. This is already checked above. 
+ */ + if (gdb_off) { + gdb_bh = sbi->s_group_desc[gdb_num]; + BUFFER_TRACE(gdb_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gdb_bh); + + if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) + err = reserve_backup_gdb(handle, resize_inode, group); + } else if (meta_bg != 0) { + err = add_new_gdb_meta_bg(sb, handle, group); + } else { + err = add_new_gdb(handle, resize_inode, group); + } + if (err) + break; } + return err; +} - if (reserved_gdb || gdb_off == 0) { - if (!EXT4_HAS_COMPAT_FEATURE(sb, - EXT4_FEATURE_COMPAT_RESIZE_INODE)){ - ext4_warning(sb, __FUNCTION__, - "No reserved GDT blocks, can't resize"); - return -EPERM; - } - inode = ext4_iget(sb, EXT4_RESIZE_INO); - if (IS_ERR(inode)) { - ext4_warning(sb, __FUNCTION__, - "Error opening resize inode"); - return PTR_ERR(inode); +static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block) +{ + struct buffer_head *bh = sb_getblk(sb, block); + if (unlikely(!bh)) + return NULL; + if (!bh_uptodate_or_lock(bh)) { + if (bh_submit_read(bh) < 0) { + brelse(bh); + return NULL; } } - if ((err = verify_group_input(sb, input))) - goto exit_put; + return bh; +} - if ((err = setup_new_group_blocks(sb, input))) - goto exit_put; +static int ext4_set_bitmap_checksums(struct super_block *sb, + ext4_group_t group, + struct ext4_group_desc *gdp, + struct ext4_new_group_data *group_data) +{ + struct buffer_head *bh; - /* - * We will always be modifying at least the superblock and a GDT - * block. If we are adding a group past the last current GDT block, - * we will also modify the inode and the dindirect block. If we - * are adding a group with superblock/GDT backups we will also - * modify each of the reserved GDT dindirect blocks. - */ - handle = ext4_journal_start_sb(sb, - ext4_bg_has_super(sb, input->group) ? - 3 + reserved_gdb : 4); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto exit_put; - } + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 0; - lock_super(sb); - if (input->group != sbi->s_groups_count) { - ext4_warning(sb, __FUNCTION__, - "multiple resizers run on filesystem!"); - err = -EBUSY; - goto exit_journal; - } + bh = ext4_get_bitmap(sb, group_data->inode_bitmap); + if (!bh) + return -EIO; + ext4_inode_bitmap_csum_set(sb, group, gdp, bh, + EXT4_INODES_PER_GROUP(sb) / 8); + brelse(bh); - if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) - goto exit_journal; + bh = ext4_get_bitmap(sb, group_data->block_bitmap); + if (!bh) + return -EIO; + ext4_block_bitmap_csum_set(sb, group, gdp, bh); + brelse(bh); - /* - * We will only either add reserved group blocks to a backup group - * or remove reserved blocks for the first group in a new group block. - * Doing both would be mean more complex code, and sane people don't - * use non-sparse filesystems anymore. This is already checked above. - */ - if (gdb_off) { - primary = sbi->s_group_desc[gdb_num]; - if ((err = ext4_journal_get_write_access(handle, primary))) - goto exit_journal; - - if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) && - (err = reserve_backup_gdb(handle, inode, input))) - goto exit_journal; - } else if ((err = add_new_gdb(handle, inode, input, &primary))) - goto exit_journal; + return 0; +} - /* - * OK, now we've set up the new group. Time to make it active. - * - * Current kernels don't lock all allocations via lock_super(), - * so we have to be safe wrt. concurrent accesses the group - * data. 
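ext4_set_bitmap_checksums() above seeds the new groups' bitmap checksums; with metadata_csum these are CRC-32C values. Below is a self-contained bitwise CRC-32C (reflected Castagnoli polynomial 0x82F63B78) over one bitmap-sized buffer. Note this is only a sketch: the kernel seeds the checksum with a per-filesystem value derived from the UUID and truncates bitmap checksums by its own rules, none of which is reproduced here.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
        const uint8_t *p = buf;
        int k;

        while (len--) {
                crc ^= *p++;
                for (k = 0; k < 8; k++)
                        crc = (crc >> 1) ^ (0x82F63B78 & (0U - (crc & 1)));
        }
        return crc;
}

int main(void)
{
        uint8_t bitmap[4096];   /* one 4 KiB block bitmap */

        memset(bitmap, 0xff, sizeof(bitmap));   /* fully-used toy bitmap */
        printf("crc32c = %#010x\n",
               (unsigned int)crc32c(~0U, bitmap, sizeof(bitmap)));
        return 0;
}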
So we need to be careful to set all of the relevant - * group descriptor data etc. *before* we enable the group. - * - * The key field here is sbi->s_groups_count: as long as - * that retains its old value, nobody is going to access the new - * group. - * - * So first we update all the descriptor metadata for the new - * group; then we update the total disk blocks count; then we - * update the groups count to enable the group; then finally we - * update the free space counts so that the system can start - * using the new disk blocks. - */ +/* + * ext4_setup_new_descs() will set up the group descriptors of a flex bg + */ +static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_group_desc *gdp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *gdb_bh; + ext4_group_t group; + __u16 *bg_flags = flex_gd->bg_flags; + int i, gdb_off, gdb_num, err = 0; + + + for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) { + group = group_data->group; + + gdb_off = group % EXT4_DESC_PER_BLOCK(sb); + gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + + /* + * get_write_access() has been called on gdb_bh by ext4_add_new_desc(). + */ + gdb_bh = sbi->s_group_desc[gdb_num]; + /* Update group descriptor block for new group */ + gdp = (struct ext4_group_desc *)(gdb_bh->b_data + + gdb_off * EXT4_DESC_SIZE(sb)); + + memset(gdp, 0, EXT4_DESC_SIZE(sb)); + ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap); + ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap); + err = ext4_set_bitmap_checksums(sb, group, gdp, group_data); + if (err) { + ext4_std_error(sb, err); + break; + } - /* Update group descriptor block for new group */ - gdp = (struct ext4_group_desc *)primary->b_data + gdb_off; + ext4_inode_table_set(sb, gdp, group_data->inode_table); + ext4_free_group_clusters_set(sb, gdp, + EXT4_NUM_B2C(sbi, group_data->free_blocks_count)); + ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); + if (ext4_has_group_desc_csum(sb)) + ext4_itable_unused_set(sb, gdp, + EXT4_INODES_PER_GROUP(sb)); + gdp->bg_flags = cpu_to_le16(*bg_flags); + ext4_group_desc_csum_set(sb, group, gdp); + + err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); + if (unlikely(err)) { + ext4_std_error(sb, err); + break; + } - ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ - ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ - ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ - gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); - gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb)); - gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); + /* + * We can allocate memory for mb_alloc based on the new group + * descriptor + */ + err = ext4_mb_add_groupinfo(sb, group, gdp); + if (err) + break; + } + return err; +} +/* + * ext4_update_super() updates the super block so that the newly added + * groups can be seen by the filesystem. 
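ext4_setup_new_descs() locates each new descriptor by splitting the group number into a GDT-block index (gdb_num) and a slot within that block (gdb_off), then doing byte arithmetic into the block buffer. The same split, in a userspace sketch with assumed toy geometry:

#include <stdio.h>

#define BLOCK_SIZE      4096
#define DESC_SIZE       64      /* ext4 uses 32 or 64 bytes; 64 assumed */
#define DESC_PER_BLOCK  (BLOCK_SIZE / DESC_SIZE)

int main(void)
{
        unsigned char gdt_block[BLOCK_SIZE];    /* stands in for gdb_bh->b_data */
        unsigned int group = 70;
        unsigned int gdb_num = group / DESC_PER_BLOCK;
        unsigned int gdb_off = group % DESC_PER_BLOCK;
        unsigned char *gdp = gdt_block + gdb_off * DESC_SIZE;

        printf("group %u -> GDT block %u, slot %u (byte offset %td)\n",
               group, gdb_num, gdb_off, gdp - gdt_block);
        return 0;
}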
+ * + * @sb: super block + * @flex_gd: new added groups + */ +static void ext4_update_super(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd) +{ + ext4_fsblk_t blocks_count = 0; + ext4_fsblk_t free_blocks = 0; + ext4_fsblk_t reserved_blocks = 0; + struct ext4_new_group_data *group_data = flex_gd->groups; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int i; + + BUG_ON(flex_gd->count == 0 || group_data == NULL); /* * Make the new blocks and inodes valid next. We do this before * increasing the group count so that once the group is enabled, @@ -875,26 +1324,35 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) * blocks/inodes before the group is live won't actually let us * allocate the new space yet. */ - ext4_blocks_count_set(es, ext4_blocks_count(es) + - input->blocks_count); - es->s_inodes_count = cpu_to_le32(le32_to_cpu(es->s_inodes_count) + - EXT4_INODES_PER_GROUP(sb)); + for (i = 0; i < flex_gd->count; i++) { + blocks_count += group_data[i].blocks_count; + free_blocks += group_data[i].free_blocks_count; + } + reserved_blocks = ext4_r_blocks_count(es) * 100; + reserved_blocks = div64_u64(reserved_blocks, ext4_blocks_count(es)); + reserved_blocks *= blocks_count; + do_div(reserved_blocks, 100); + + ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); + ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks); + le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * + flex_gd->count); + le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) * + flex_gd->count); + + ext4_debug("free blocks count %llu", ext4_free_blocks_count(es)); /* * We need to protect s_groups_count against other CPUs seeing * inconsistent state in the superblock. * * The precise rules we use are: * - * * Writers of s_groups_count *must* hold lock_super - * AND - * * Writers must perform a smp_wmb() after updating all dependent - * data and before modifying the groups count + * * Writers must perform a smp_wmb() after updating all + * dependent data and before modifying the groups count * - * * Readers must hold lock_super() over the access - * OR - * * Readers must perform an smp_rmb() after reading the groups count - * and before reading any dependent data. + * * Readers must perform an smp_rmb() after reading the groups + * count and before reading any dependent data. * * NB. These rules can be relaxed when checking the group count * while freeing data, as we can only allocate from a block @@ -905,40 +1363,334 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) smp_wmb(); /* Update the global fs size fields */ - sbi->s_groups_count++; - - ext4_journal_dirty_metadata(handle, primary); + sbi->s_groups_count += flex_gd->count; + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); /* Update the reserved block counts only once the new group is * active. 
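The reserved_blocks arithmetic in ext4_update_super() keeps the existing reserved-block percentage for the added space using only integer operations: it computes the reserved fraction in whole percent (r_blocks * 100 / total), scales the added block count by it, then divides by 100. Worked through with made-up numbers:

#include <stdio.h>
#include <inttypes.h>

int main(void)
{
        uint64_t blocks_count = 1000000;  /* current fs size (toy) */
        uint64_t r_blocks = 50000;        /* currently reserved: 5% */
        uint64_t added = 250000;          /* blocks being added */

        uint64_t reserved = r_blocks * 100;
        reserved /= blocks_count;  /* whole percent reserved; truncates */
        reserved *= added;
        reserved /= 100;           /* same percentage of the new space */

        printf("reserving %" PRIu64 " of %" PRIu64 " added blocks\n",
               reserved, added);   /* 12500, i.e. 5% of 250000 */
        return 0;
}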
*/ ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + - input->reserved_blocks); + reserved_blocks); /* Update the free space counts */ - percpu_counter_add(&sbi->s_freeblocks_counter, - input->free_blocks_count); + percpu_counter_add(&sbi->s_freeclusters_counter, + EXT4_NUM_B2C(sbi, free_blocks)); percpu_counter_add(&sbi->s_freeinodes_counter, - EXT4_INODES_PER_GROUP(sb)); + EXT4_INODES_PER_GROUP(sb) * flex_gd->count); + + ext4_debug("free blocks count %llu", + percpu_counter_read(&sbi->s_freeclusters_counter)); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_FLEX_BG) && + sbi->s_log_groups_per_flex) { + ext4_group_t flex_group; + flex_group = ext4_flex_group(sbi, group_data[0].group); + atomic64_add(EXT4_NUM_B2C(sbi, free_blocks), + &sbi->s_flex_groups[flex_group].free_clusters); + atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, + &sbi->s_flex_groups[flex_group].free_inodes); + } - ext4_journal_dirty_metadata(handle, sbi->s_sbh); - sb->s_dirt = 1; + /* + * Update the fs overhead information + */ + ext4_calculate_overhead(sb); + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: added group %u:" + "%llu blocks(%llu free %llu reserved)\n", flex_gd->count, + blocks_count, free_blocks, reserved_blocks); +} + +/* Add a flex group to an fs. Ensure we handle all possible error conditions + * _before_ we start modifying the filesystem, because we cannot abort the + * transaction and not have it write the data to disk. + */ +static int ext4_flex_group_add(struct super_block *sb, + struct inode *resize_inode, + struct ext4_new_flex_group_data *flex_gd) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_fsblk_t o_blocks_count; + ext4_grpblk_t last; + ext4_group_t group; + handle_t *handle; + unsigned reserved_gdb; + int err = 0, err2 = 0, credit; + + BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags); + + reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); + o_blocks_count = ext4_blocks_count(es); + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + BUG_ON(last); + + err = setup_new_flex_group_blocks(sb, flex_gd); + if (err) + goto exit; + /* + * We will always be modifying at least the superblock and GDT + * block. If we are adding a group past the last current GDT block, + * we will also modify the inode and the dindirect block. If we + * are adding a group with superblock/GDT backups we will also + * modify each of the reserved GDT dindirect blocks. 
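The flex-group counter updates above index s_flex_groups[] by shifting the block group number down by s_log_groups_per_flex: a flex group is simply 2^s_log_groups_per_flex consecutive block groups. A tiny sketch with an assumed log value:

#include <stdio.h>

int main(void)
{
        unsigned int log_groups_per_flex = 4;   /* toy: 16 groups per flex group */
        unsigned int groups[] = { 0, 13, 26, 39 };
        unsigned int i;

        for (i = 0; i < sizeof(groups) / sizeof(groups[0]); i++)
                printf("group %2u -> flex group %u\n",
                       groups[i], groups[i] >> log_groups_per_flex);
        return 0;
}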
+ */ + credit = flex_gd->count * 4 + reserved_gdb; + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto exit; + } + + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto exit_journal; + + group = flex_gd->groups[0].group; + BUG_ON(group != EXT4_SB(sb)->s_groups_count); + err = ext4_add_new_descs(handle, sb, group, + resize_inode, flex_gd->count); + if (err) + goto exit_journal; + + err = ext4_setup_new_descs(handle, sb, flex_gd); + if (err) + goto exit_journal; + + ext4_update_super(sb, flex_gd); + + err = ext4_handle_dirty_super(handle, sb); exit_journal: - unlock_super(sb); - if ((err2 = ext4_journal_stop(handle)) && !err) + err2 = ext4_journal_stop(handle); + if (!err) err = err2; + if (!err) { + int gdb_num = group / EXT4_DESC_PER_BLOCK(sb); + int gdb_num_end = ((group + flex_gd->count - 1) / + EXT4_DESC_PER_BLOCK(sb)); + int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_META_BG); + sector_t old_gdb = 0; + update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext4_super_block)); - update_backups(sb, primary->b_blocknr, primary->b_data, - primary->b_size); + sizeof(struct ext4_super_block), 0); + for (; gdb_num <= gdb_num_end; gdb_num++) { + struct buffer_head *gdb_bh; + + gdb_bh = sbi->s_group_desc[gdb_num]; + if (old_gdb == gdb_bh->b_blocknr) + continue; + update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, + gdb_bh->b_size, meta_bg); + old_gdb = gdb_bh->b_blocknr; + } } -exit_put: +exit: + return err; +} + +static int ext4_setup_next_flex_gd(struct super_block *sb, + struct ext4_new_flex_group_data *flex_gd, + ext4_fsblk_t n_blocks_count, + unsigned long flexbg_size) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct ext4_new_group_data *group_data = flex_gd->groups; + ext4_fsblk_t o_blocks_count; + ext4_group_t n_group; + ext4_group_t group; + ext4_group_t last_group; + ext4_grpblk_t last; + ext4_grpblk_t blocks_per_group; + unsigned long i; + + blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb); + + o_blocks_count = ext4_blocks_count(es); + + if (o_blocks_count == n_blocks_count) + return 0; + + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); + BUG_ON(last); + ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last); + + last_group = group | (flexbg_size - 1); + if (last_group > n_group) + last_group = n_group; + + flex_gd->count = last_group - group + 1; + + for (i = 0; i < flex_gd->count; i++) { + int overhead; + + group_data[i].group = group + i; + group_data[i].blocks_count = blocks_per_group; + overhead = ext4_group_overhead_blocks(sb, group + i); + group_data[i].free_blocks_count = blocks_per_group - overhead; + if (ext4_has_group_desc_csum(sb)) { + flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | + EXT4_BG_INODE_UNINIT; + if (!test_opt(sb, INIT_INODE_TABLE)) + flex_gd->bg_flags[i] |= EXT4_BG_INODE_ZEROED; + } else + flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED; + } + + if (last_group == n_group && ext4_has_group_desc_csum(sb)) + /* We need to initialize block bitmap of last group. */ + flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT; + + if ((last_group == n_group) && (last != blocks_per_group - 1)) { + group_data[i - 1].blocks_count = last + 1; + group_data[i - 1].free_blocks_count -= blocks_per_group- + last - 1; + } + + return 1; +} + +/* Add group descriptor data to an existing or new group descriptor block. 
+ * Ensure we handle all possible error conditions _before_ we start modifying + * the filesystem, because we cannot abort the transaction and not have it + * write the data to disk. + * + * If we are on a GDT block boundary, we need to get the reserved GDT block. + * Otherwise, we may need to add backup GDT blocks for a sparse group. + * + * We only need to hold the superblock lock while we are actually adding + * in the new group's counts to the superblock. Prior to that we have + * not really "added" the group at all. We re-check that we are still + * adding in the last group in case things have changed since verifying. + */ +int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) +{ + struct ext4_new_flex_group_data flex_gd; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int reserved_gdb = ext4_bg_has_super(sb, input->group) ? + le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + struct inode *inode = NULL; + int gdb_off; + int err; + __u16 bg_flags = 0; + + gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); + + if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ext4_warning(sb, "Can't resize non-sparse filesystem further"); + return -EPERM; + } + + if (ext4_blocks_count(es) + input->blocks_count < + ext4_blocks_count(es)) { + ext4_warning(sb, "blocks_count overflow"); + return -EINVAL; + } + + if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < + le32_to_cpu(es->s_inodes_count)) { + ext4_warning(sb, "inodes_count overflow"); + return -EINVAL; + } + + if (reserved_gdb || gdb_off == 0) { + if (!EXT4_HAS_COMPAT_FEATURE(sb, + EXT4_FEATURE_COMPAT_RESIZE_INODE) + || !le16_to_cpu(es->s_reserved_gdt_blocks)) { + ext4_warning(sb, + "No reserved GDT blocks, can't resize"); + return -EPERM; + } + inode = ext4_iget(sb, EXT4_RESIZE_INO); + if (IS_ERR(inode)) { + ext4_warning(sb, "Error opening resize inode"); + return PTR_ERR(inode); + } + } + + + err = verify_group_input(sb, input); + if (err) + goto out; + + err = ext4_alloc_flex_bg_array(sb, input->group + 1); + if (err) + goto out; + + err = ext4_mb_alloc_groupinfo(sb, input->group + 1); + if (err) + goto out; + + flex_gd.count = 1; + flex_gd.groups = input; + flex_gd.bg_flags = &bg_flags; + err = ext4_flex_group_add(sb, inode, &flex_gd); +out: + iput(inode); + return err; +} /* ext4_group_add */ -/* Extend the filesystem to the new number of blocks specified. This entry +/* + * extend a group without checking, assuming that checking has been done. + */ +static int ext4_group_extend_no_check(struct super_block *sb, + ext4_fsblk_t o_blocks_count, ext4_grpblk_t add) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + handle_t *handle; + int err = 0, err2; + + /* We will update the superblock, one block bitmap, and + * one group descriptor via ext4_group_add_blocks(). 
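The blocks_count and inodes_count checks in ext4_group_add() use the standard unsigned wrap-around test: if a + b is smaller than a, the addition overflowed. A minimal demonstration:

#include <stdio.h>
#include <stdint.h>

static int add_overflows(uint64_t a, uint64_t b)
{
        return a + b < a;       /* well-defined for unsigned types */
}

int main(void)
{
        printf("%d\n", add_overflows(UINT64_MAX - 1, 1));   /* 0: fits */
        printf("%d\n", add_overflows(UINT64_MAX - 1, 2));   /* 1: wraps */
        return 0;
}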
+ */ + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext4_warning(sb, "error %d on journal start", err); + return err; + } + + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + if (err) { + ext4_warning(sb, "error %d on journal write access", err); + goto errout; + } + + ext4_blocks_count_set(es, o_blocks_count + add); + ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add); + ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); + /* We add the blocks to the bitmap and set the group need init bit */ + err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); + if (err) + goto errout; + ext4_handle_dirty_super(handle, sb); + ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, + o_blocks_count + add); +errout: + err2 = ext4_journal_stop(handle); + if (err2 && !err) + err = err2; + + if (!err) { + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: extended group to %llu " + "blocks\n", ext4_blocks_count(es)); + update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, + (char *)es, sizeof(struct ext4_super_block), 0); + } + return err; +} + +/* + * Extend the filesystem to the new number of blocks specified. This entry * point is only used to extend the current filesystem to the end of the last * existing group. It can be accessed via ioctl, or by "remount,resize=<size>" * for emergencies (because it has no dependencies on reserved blocks). @@ -951,56 +1703,48 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t n_blocks_count) { ext4_fsblk_t o_blocks_count; - ext4_group_t o_groups_count; ext4_grpblk_t last; ext4_grpblk_t add; - struct buffer_head * bh; - handle_t *handle; + struct buffer_head *bh; int err; - unsigned long freed_blocks; + ext4_group_t group; - /* We don't need to worry about locking wrt other resizers just - * yet: we're going to revalidate es->s_blocks_count after - * taking lock_super() below. */ o_blocks_count = ext4_blocks_count(es); - o_groups_count = EXT4_SB(sb)->s_groups_count; if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n", - o_blocks_count, n_blocks_count); + ext4_msg(sb, KERN_DEBUG, + "extending last group from %llu to %llu blocks", + o_blocks_count, n_blocks_count); if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) return 0; if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { - printk(KERN_ERR "EXT4-fs: filesystem on %s:" - " too large to resize to %llu blocks safely\n", - sb->s_id, n_blocks_count); + ext4_msg(sb, KERN_ERR, + "filesystem too large to resize to %llu blocks safely", + n_blocks_count); if (sizeof(sector_t) < 8) - ext4_warning(sb, __FUNCTION__, - "CONFIG_LBD not enabled\n"); + ext4_warning(sb, "CONFIG_LBDAF not enabled"); return -EINVAL; } if (n_blocks_count < o_blocks_count) { - ext4_warning(sb, __FUNCTION__, - "can't shrink FS - resize aborted"); - return -EBUSY; + ext4_warning(sb, "can't shrink FS - resize aborted"); + return -EINVAL; } /* Handle the remaining blocks in the last group only. 
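ext4_group_extend() only tops up a partially filled last group: it splits the old block count into a group number and an offset ("last") inside that group, and can add at most blocks_per_group - last blocks. The same arithmetic as a sketch, assuming toy geometry and s_first_data_block == 0:

#include <stdio.h>

#define BPG 32768       /* toy blocks per group */

int main(void)
{
        unsigned long long o_blocks_count = 1000000;
        unsigned int group = o_blocks_count / BPG;
        unsigned int last = o_blocks_count % BPG;  /* offset in last group */
        unsigned int add = last ? BPG - last : 0;

        printf("block count %llu ends in group %u at offset %u; "
               "room for %u more blocks\n", o_blocks_count, group, last, add);
        return 0;
}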
*/ - ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last); + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); if (last == 0) { - ext4_warning(sb, __FUNCTION__, - "need to use ext2online to resize further"); + ext4_warning(sb, "need to use ext2online to resize further"); return -EPERM; } add = EXT4_BLOCKS_PER_GROUP(sb) - last; if (o_blocks_count + add < o_blocks_count) { - ext4_warning(sb, __FUNCTION__, "blocks_count overflow"); + ext4_warning(sb, "blocks_count overflow"); return -EINVAL; } @@ -1008,64 +1752,269 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, add = n_blocks_count - o_blocks_count; if (o_blocks_count + add < n_blocks_count) - ext4_warning(sb, __FUNCTION__, - "will only finish group (%llu" - " blocks, %u new)", + ext4_warning(sb, "will only finish group (%llu blocks, %u new)", o_blocks_count + add, add); /* See if the device is actually as big as what was requested */ - bh = sb_bread(sb, o_blocks_count + add -1); + bh = sb_bread(sb, o_blocks_count + add - 1); if (!bh) { - ext4_warning(sb, __FUNCTION__, - "can't read last block, resize aborted"); + ext4_warning(sb, "can't read last block, resize aborted"); return -ENOSPC; } brelse(bh); - /* We will update the superblock, one block bitmap, and - * one group descriptor via ext4_free_blocks(). - */ - handle = ext4_journal_start_sb(sb, 3); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - ext4_warning(sb, __FUNCTION__, "error %d on journal start",err); - goto exit_put; + err = ext4_group_extend_no_check(sb, o_blocks_count, add); + return err; +} /* ext4_group_extend */ + + +static int num_desc_blocks(struct super_block *sb, ext4_group_t groups) +{ + return (groups + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); +} + +/* + * Release the resize inode and drop the resize_inode feature if there + * are no more reserved gdt blocks, and then convert the file system + * to enable meta_bg + */ +static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) +{ + handle_t *handle; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct ext4_inode_info *ei = EXT4_I(inode); + ext4_fsblk_t nr; + int i, ret, err = 0; + int credits = 1; + + ext4_msg(sb, KERN_INFO, "Converting file system to meta_bg"); + if (inode) { + if (es->s_reserved_gdt_blocks) { + ext4_error(sb, "Unexpected non-zero " + "s_reserved_gdt_blocks"); + return -EPERM; + } + + /* Do a quick sanity check of the resize inode */ + if (inode->i_blocks != 1 << (inode->i_blkbits - 9)) + goto invalid_resize_inode; + for (i = 0; i < EXT4_N_BLOCKS; i++) { + if (i == EXT4_DIND_BLOCK) { + if (ei->i_data[i]) + continue; + else + goto invalid_resize_inode; + } + if (ei->i_data[i]) + goto invalid_resize_inode; + } + credits += 3; /* block bitmap, bg descriptor, resize inode */ } - lock_super(sb); - if (o_blocks_count != ext4_blocks_count(es)) { - ext4_warning(sb, __FUNCTION__, - "multiple resizers run on filesystem!"); - unlock_super(sb); - ext4_journal_stop(handle); - err = -EBUSY; - goto exit_put; + handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto errout; + + EXT4_CLEAR_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE); + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); + sbi->s_es->s_first_meta_bg = + cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count)); + + err = 
ext4_handle_dirty_super(handle, sb); + if (err) { + ext4_std_error(sb, err); + goto errout; } - if ((err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh))) { - ext4_warning(sb, __FUNCTION__, - "error %d on journal write access", err); - unlock_super(sb); - ext4_journal_stop(handle); - goto exit_put; + if (inode) { + nr = le32_to_cpu(ei->i_data[EXT4_DIND_BLOCK]); + ext4_free_blocks(handle, inode, NULL, nr, 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + ei->i_data[EXT4_DIND_BLOCK] = 0; + inode->i_blocks = 0; + + err = ext4_mark_inode_dirty(handle, inode); + if (err) + ext4_std_error(sb, err); } - ext4_blocks_count_set(es, o_blocks_count + add); - ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); - sb->s_dirt = 1; - unlock_super(sb); - ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, - o_blocks_count + add); - ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); - ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, - o_blocks_count + add); - if ((err = ext4_journal_stop(handle))) - goto exit_put; - if (test_opt(sb, DEBUG)) - printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", - ext4_blocks_count(es)); - update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es, - sizeof(struct ext4_super_block)); -exit_put: + +errout: + ret = ext4_journal_stop(handle); + if (!err) + err = ret; + return ret; + +invalid_resize_inode: + ext4_error(sb, "corrupted/inconsistent resize inode"); + return -EINVAL; +} + +/* + * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count + * + * @sb: super block of the fs to be resized + * @n_blocks_count: the number of blocks in the resized fs + */ +int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) +{ + struct ext4_new_flex_group_data *flex_gd = NULL; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct buffer_head *bh; + struct inode *resize_inode = NULL; + ext4_grpblk_t add, offset; + unsigned long n_desc_blocks; + unsigned long o_desc_blocks; + ext4_group_t o_group; + ext4_group_t n_group; + ext4_fsblk_t o_blocks_count; + ext4_fsblk_t n_blocks_count_retry = 0; + unsigned long last_update_time = 0; + int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex; + int meta_bg; + + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, n_blocks_count - 1); + if (!bh) { + ext4_warning(sb, "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + +retry: + o_blocks_count = ext4_blocks_count(es); + + ext4_msg(sb, KERN_INFO, "resizing filesystem from %llu " + "to %llu blocks", o_blocks_count, n_blocks_count); + + if (n_blocks_count < o_blocks_count) { + /* On-line shrinking not supported */ + ext4_warning(sb, "can't shrink FS - resize aborted"); + return -EINVAL; + } + + if (n_blocks_count == o_blocks_count) + /* Nothing to do */ + return 0; + + n_group = ext4_get_group_number(sb, n_blocks_count - 1); + if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) { + ext4_warning(sb, "resize would cause inodes_count overflow"); + return -EINVAL; + } + ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); + + n_desc_blocks = num_desc_blocks(sb, n_group + 1); + o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count); + + meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); + + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) { + if (meta_bg) { + ext4_error(sb, "resize_inode and meta_bg enabled " + 
"simultaneously"); + return -EINVAL; + } + if (n_desc_blocks > o_desc_blocks + + le16_to_cpu(es->s_reserved_gdt_blocks)) { + n_blocks_count_retry = n_blocks_count; + n_desc_blocks = o_desc_blocks + + le16_to_cpu(es->s_reserved_gdt_blocks); + n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); + n_blocks_count = n_group * EXT4_BLOCKS_PER_GROUP(sb); + n_group--; /* set to last group number */ + } + + if (!resize_inode) + resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); + if (IS_ERR(resize_inode)) { + ext4_warning(sb, "Error opening resize inode"); + return PTR_ERR(resize_inode); + } + } + + if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) { + err = ext4_convert_meta_bg(sb, resize_inode); + if (err) + goto out; + if (resize_inode) { + iput(resize_inode); + resize_inode = NULL; + } + if (n_blocks_count_retry) { + n_blocks_count = n_blocks_count_retry; + n_blocks_count_retry = 0; + goto retry; + } + } + + /* extend the last group */ + if (n_group == o_group) + add = n_blocks_count - o_blocks_count; + else + add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1); + if (add > 0) { + err = ext4_group_extend_no_check(sb, o_blocks_count, add); + if (err) + goto out; + } + + if (ext4_blocks_count(es) == n_blocks_count) + goto out; + + err = ext4_alloc_flex_bg_array(sb, n_group + 1); + if (err) + return err; + + err = ext4_mb_alloc_groupinfo(sb, n_group + 1); + if (err) + goto out; + + flex_gd = alloc_flex_gd(flexbg_size); + if (flex_gd == NULL) { + err = -ENOMEM; + goto out; + } + + /* Add flex groups. Note that a regular group is a + * flex group with 1 group. + */ + while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count, + flexbg_size)) { + if (jiffies - last_update_time > HZ * 10) { + if (last_update_time) + ext4_msg(sb, KERN_INFO, + "resized to %llu blocks", + ext4_blocks_count(es)); + last_update_time = jiffies; + } + if (ext4_alloc_group_tables(sb, flex_gd, flexbg_size) != 0) + break; + err = ext4_flex_group_add(sb, resize_inode, flex_gd); + if (unlikely(err)) + break; + } + + if (!err && n_blocks_count_retry) { + n_blocks_count = n_blocks_count_retry; + n_blocks_count_retry = 0; + free_flex_gd(flex_gd); + flex_gd = NULL; + goto retry; + } + +out: + if (flex_gd) + free_flex_gd(flex_gd); + if (resize_inode != NULL) + iput(resize_inode); + ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count); return err; -} /* ext4_group_extend */ +} diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 13383ba18f1..6df7bc611db 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -20,14 +20,12 @@ #include <linux/string.h> #include <linux/fs.h> #include <linux/time.h> +#include <linux/vmalloc.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/parser.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/vfs.h> @@ -36,43 +34,166 @@ #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/seq_file.h> +#include <linux/proc_fs.h> +#include <linux/ctype.h> #include <linux/log2.h> #include <linux/crc16.h> - +#include <linux/cleancache.h> #include <asm/uaccess.h> +#include <linux/kthread.h> +#include <linux/freezer.h> + +#include "ext4.h" +#include "ext4_extents.h" /* Needed for trace points definition */ +#include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "namei.h" -#include "group.h" +#include "mballoc.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/ext4.h> + +static struct 
proc_dir_entry *ext4_proc_root; +static struct kset *ext4_kset; +static struct ext4_lazy_init *ext4_li_info; +static struct mutex ext4_li_mtx; +static struct ext4_features *ext4_feat; +static int ext4_mballoc_ready; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); -static int ext4_create_journal(struct super_block *, struct ext4_super_block *, - unsigned int); -static void ext4_commit_super (struct super_block * sb, - struct ext4_super_block * es, - int sync); -static void ext4_mark_recovery_complete(struct super_block * sb, - struct ext4_super_block * es); -static void ext4_clear_journal_err(struct super_block * sb, - struct ext4_super_block * es); +static int ext4_show_options(struct seq_file *seq, struct dentry *root); +static int ext4_commit_super(struct super_block *sb, int sync); +static void ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es); +static void ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); -static const char *ext4_decode_error(struct super_block * sb, int errno, - char nbuf[16]); -static int ext4_remount (struct super_block * sb, int * flags, char * data); -static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf); -static void ext4_unlockfs(struct super_block *sb); -static void ext4_write_super (struct super_block * sb); -static void ext4_write_super_lockfs(struct super_block *sb); +static int ext4_sync_fs_nojournal(struct super_block *sb, int wait); +static int ext4_remount(struct super_block *sb, int *flags, char *data); +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); +static int ext4_unfreeze(struct super_block *sb); +static int ext4_freeze(struct super_block *sb); +static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data); +static inline int ext2_feature_set_ok(struct super_block *sb); +static inline int ext3_feature_set_ok(struct super_block *sb); +static int ext4_feature_set_ok(struct super_block *sb, int readonly); +static void ext4_destroy_lazyinit_thread(void); +static void ext4_unregister_li_request(struct super_block *sb); +static void ext4_clear_request_list(void); +static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t); + +#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static struct file_system_type ext2_fs_type = { + .owner = THIS_MODULE, + .name = "ext2", + .mount = ext4_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("ext2"); +MODULE_ALIAS("ext2"); +#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) +#else +#define IS_EXT2_SB(sb) (0) +#endif + + +#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static struct file_system_type ext3_fs_type = { + .owner = THIS_MODULE, + .name = "ext3", + .mount = ext4_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("ext3"); +MODULE_ALIAS("ext3"); +#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) +#else +#define IS_EXT3_SB(sb) (0) +#endif + +static int ext4_verify_csum_type(struct super_block *sb, + struct ext4_super_block *es) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + return es->s_checksum_type == EXT4_CRC32C_CHKSUM; +} + +static __le32 ext4_superblock_csum(struct 
super_block *sb, + struct ext4_super_block *es) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int offset = offsetof(struct ext4_super_block, s_checksum); + __u32 csum; + + csum = ext4_chksum(sbi, ~0, (char *)es, offset); + + return cpu_to_le32(csum); +} + +static int ext4_superblock_csum_verify(struct super_block *sb, + struct ext4_super_block *es) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return 1; + + return es->s_checksum == ext4_superblock_csum(sb, es); +} + +void ext4_superblock_csum_set(struct super_block *sb) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + es->s_checksum = ext4_superblock_csum(sb, es); +} + +void *ext4_kvmalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kmalloc(size, flags | __GFP_NOWARN); + if (!ret) + ret = __vmalloc(size, flags, PAGE_KERNEL); + return ret; +} + +void *ext4_kvzalloc(size_t size, gfp_t flags) +{ + void *ret; + + ret = kzalloc(size, flags | __GFP_NOWARN); + if (!ret) + ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); + return ret; +} + +void ext4_kvfree(void *ptr) +{ + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else + kfree(ptr); +} ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_block_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); + (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, @@ -80,7 +201,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, { return le32_to_cpu(bg->bg_inode_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); + (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_table(struct super_block *sb, @@ -88,7 +209,39 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb, { return le32_to_cpu(bg->bg_inode_table_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); + (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); +} + +__u32 ext4_free_group_clusters(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_free_blocks_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); +} + +__u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_free_inodes_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); +} + +__u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_used_dirs_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); +} + +__u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_itable_unused_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); } void ext4_block_bitmap_set(struct super_block *sb, @@ -115,76 +268,107 @@ void ext4_inode_table_set(struct super_block *sb, bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); } -/* - * Wrappers for jbd2_journal_start/end. 
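The _lo/_hi getters above exist because 64-bit (and wide 32-bit) quantities are split between the original descriptor fields and an extension area that is only valid when the descriptor size is at least EXT4_MIN_DESC_SIZE_64BIT; a getter stitches the halves back together. A simplified stand-in (native-endian fields and a plain flag; the real structures use little-endian le32/le16 and a descriptor-size check):

#include <stdio.h>
#include <stdint.h>

struct toy_desc {
        uint32_t bg_block_bitmap_lo;
        uint32_t bg_block_bitmap_hi;
        int has_64bit;  /* stands in for the desc-size >= 64 check */
};

static uint64_t block_bitmap(const struct toy_desc *bg)
{
        return bg->bg_block_bitmap_lo |
               (bg->has_64bit ? (uint64_t)bg->bg_block_bitmap_hi << 32 : 0);
}

int main(void)
{
        struct toy_desc d = { 0xdeadbeef, 0x1, 1 };

        printf("%#llx\n", (unsigned long long)block_bitmap(&d));  /* 0x1deadbeef */
        return 0;
}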
- * - * The only special thing we need to do here is to make sure that all - * journal_end calls result in the superblock being marked dirty, so - * that sync() will call the filesystem's write_super callback if - * appropriate. - */ -handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) +void ext4_free_group_clusters_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) { - journal_t *journal; - - if (sb->s_flags & MS_RDONLY) - return ERR_PTR(-EROFS); - - /* Special case here: if the journal has aborted behind our - * backs (eg. EIO in the commit thread), then we still need to - * take the FS itself readonly cleanly. */ - journal = EXT4_SB(sb)->s_journal; - if (is_journal_aborted(journal)) { - ext4_abort(sb, __FUNCTION__, - "Detected aborted journal"); - return ERR_PTR(-EROFS); - } - - return jbd2_journal_start(journal, nblocks); + bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); } -/* - * The only special thing we need to do here is to make sure that all - * jbd2_journal_stop calls result in the superblock being marked dirty, so - * that sync() will call the filesystem's write_super callback if - * appropriate. - */ -int __ext4_journal_stop(const char *where, handle_t *handle) +void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) { - struct super_block *sb; - int err; - int rc; + bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); +} - sb = handle->h_transaction->t_journal->j_private; - err = handle->h_err; - rc = jbd2_journal_stop(handle); +void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); +} - if (!err) - err = rc; - if (err) - __ext4_std_error(sb, where, err); - return err; +void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); } -void ext4_journal_abort_handle(const char *caller, const char *err_fn, - struct buffer_head *bh, handle_t *handle, int err) + +static void __save_error_info(struct super_block *sb, const char *func, + unsigned int line) { - char nbuf[16]; - const char *errstr = ext4_decode_error(NULL, err, nbuf); + struct ext4_super_block *es = EXT4_SB(sb)->s_es; - if (bh) - BUFFER_TRACE(bh, "abort"); + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + es->s_last_error_time = cpu_to_le32(get_seconds()); + strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); + es->s_last_error_line = cpu_to_le32(line); + if (!es->s_first_error_time) { + es->s_first_error_time = es->s_last_error_time; + strncpy(es->s_first_error_func, func, + sizeof(es->s_first_error_func)); + es->s_first_error_line = cpu_to_le32(line); + es->s_first_error_ino = es->s_last_error_ino; + es->s_first_error_block = es->s_last_error_block; + } + /* + * Start the daily error reporting function if it hasn't been + * started already + */ + if (!es->s_error_count) + mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); + 
le32_add_cpu(&es->s_error_count, 1); +} - if (!handle->h_err) - handle->h_err = err; +static void save_error_info(struct super_block *sb, const char *func, + unsigned int line) +{ + __save_error_info(sb, func, line); + ext4_commit_super(sb, 1); +} - if (is_handle_aborted(handle)) - return; +/* + * The del_gendisk() function uninitializes the disk-specific data + * structures, including the bdi structure, without telling anyone + * else. Once this happens, any attempt to call mark_buffer_dirty() + * (for example, by ext4_commit_super), will cause a kernel OOPS. + * This is a kludge to prevent these oops until we can put in a proper + * hook in del_gendisk() to inform the VFS and file system layers. + */ +static int block_device_ejected(struct super_block *sb) +{ + struct inode *bd_inode = sb->s_bdev->bd_inode; + struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info; - printk(KERN_ERR "%s: aborting transaction: %s in %s\n", - caller, errstr, err_fn); + return bdi->dev == NULL; +} - jbd2_journal_abort_handle(handle); +static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) +{ + struct super_block *sb = journal->j_private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int error = is_journal_aborted(journal); + struct ext4_journal_cb_entry *jce; + + BUG_ON(txn->t_state == T_FINISHED); + spin_lock(&sbi->s_md_lock); + while (!list_empty(&txn->t_private_list)) { + jce = list_entry(txn->t_private_list.next, + struct ext4_journal_cb_entry, jce_list); + list_del_init(&jce->jce_list); + spin_unlock(&sbi->s_md_lock); + jce->jce_func(sb, jce, error); + spin_lock(&sbi->s_md_lock); + } + spin_unlock(&sbi->s_md_lock); } /* Deal with the reporting of failure conditions on a filesystem such as @@ -198,53 +382,128 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn, * write out the superblock safely. * * We'll just use the jbd2_journal_abort() error code to record an error in - * the journal instead. On recovery, the journal will compain about + * the journal instead. On recovery, the journal will complain about * that error until we've noted it down and cleared it. */ static void ext4_handle_error(struct super_block *sb) { - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - if (sb->s_flags & MS_RDONLY) return; - if (!test_opt (sb, ERRORS_CONT)) { + if (!test_opt(sb, ERRORS_CONT)) { journal_t *journal = EXT4_SB(sb)->s_journal; - EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT; + EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; if (journal) jbd2_journal_abort(journal, -EIO); } - if (test_opt (sb, ERRORS_RO)) { - printk (KERN_CRIT "Remounting filesystem read-only\n"); + if (test_opt(sb, ERRORS_RO)) { + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); + /* + * Make sure updated value of ->s_mount_flags will be visible + * before ->s_flags update + */ + smp_wmb(); sb->s_flags |= MS_RDONLY; } - ext4_commit_super(sb, es, 1); if (test_opt(sb, ERRORS_PANIC)) panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); } -void ext4_error (struct super_block * sb, const char * function, - const char * fmt, ...) +#define ext4_error_ratelimit(sb) \ + ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \ + "EXT4-fs error") + +void __ext4_error(struct super_block *sb, const char *function, + unsigned int line, const char *fmt, ...) 
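/*
 * ext4_journal_commit_callback() earlier in this hunk walks a list
 * under s_md_lock but drops the lock around each jce_func() call,
 * since a callback may sleep or take other locks; list_del_init()
 * before unlocking keeps the entry off the list while the lock is not
 * held.  Generic kernel-style sketch of that drain loop (illustrative
 * names; assumes <linux/list.h> and <linux/spinlock.h>):
 */
struct cb_entry {
	struct list_head list;
	void (*func)(struct cb_entry *);
};

static void drain_callbacks(spinlock_t *lock, struct list_head *head)
{
	struct cb_entry *e;

	spin_lock(lock);
	while (!list_empty(head)) {
		e = list_first_entry(head, struct cb_entry, list);
		list_del_init(&e->list);
		spin_unlock(lock);	/* callback may sleep */
		e->func(e);
		spin_lock(lock);
	}
	spin_unlock(lock);
}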
{ + struct va_format vaf; va_list args; - va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function); - vprintk(fmt, args); - printk("\n"); - va_end(args); - + if (ext4_error_ratelimit(sb)) { + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", + sb->s_id, function, line, current->comm, &vaf); + va_end(args); + } + save_error_info(sb, function, line); ext4_handle_error(sb); } -static const char *ext4_decode_error(struct super_block * sb, int errno, - char nbuf[16]) +void __ext4_error_inode(struct inode *inode, const char *function, + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...) +{ + va_list args; + struct va_format vaf; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + + es->s_last_error_ino = cpu_to_le32(inode->i_ino); + es->s_last_error_block = cpu_to_le64(block); + if (ext4_error_ratelimit(inode->i_sb)) { + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (block) + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " + "inode #%lu: block %llu: comm %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + block, current->comm, &vaf); + else + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " + "inode #%lu: comm %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, &vaf); + va_end(args); + } + save_error_info(inode->i_sb, function, line); + ext4_handle_error(inode->i_sb); +} + +void __ext4_error_file(struct file *file, const char *function, + unsigned int line, ext4_fsblk_t block, + const char *fmt, ...) +{ + va_list args; + struct va_format vaf; + struct ext4_super_block *es; + struct inode *inode = file_inode(file); + char pathname[80], *path; + + es = EXT4_SB(inode->i_sb)->s_es; + es->s_last_error_ino = cpu_to_le32(inode->i_ino); + if (ext4_error_ratelimit(inode->i_sb)) { + path = d_path(&(file->f_path), pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (block) + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "block %llu: comm %s: path %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + block, current->comm, path, &vaf); + else + printk(KERN_CRIT + "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "comm %s: path %s: %pV\n", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, path, &vaf); + va_end(args); + } + save_error_info(inode->i_sb, function, line); + ext4_handle_error(inode->i_sb); +} + +const char *ext4_decode_error(struct super_block *sb, int errno, + char nbuf[16]) { char *errstr = NULL; @@ -256,7 +515,8 @@ static const char *ext4_decode_error(struct super_block * sb, int errno, errstr = "Out of memory"; break; case -EROFS: - if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT) + if (!sb || (EXT4_SB(sb)->s_journal && + EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) errstr = "Journal has aborted"; else errstr = "Readonly filesystem"; @@ -279,8 +539,8 @@ static const char *ext4_decode_error(struct super_block * sb, int errno, /* __ext4_std_error decodes expected errors from journaling functions * automatically and invokes the appropriate error response. 
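/*
 * The __ext4_error*() helpers above hand their varargs to printk()
 * through struct va_format and the %pV specifier, so the message is
 * formatted exactly once and no fixed-size temporary buffer is
 * needed.  Condensed sketch of the idiom (kernel-style, illustrative
 * "myfs" wrapper):
 */
static void my_error(const char *dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;	/* %pV consumes the still-live va_list */
	printk(KERN_CRIT "myfs error (device %s): %pV\n", dev, &vaf);
	va_end(args);
}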
*/ -void __ext4_std_error (struct super_block * sb, const char * function, - int errno) +void __ext4_std_error(struct super_block *sb, const char *function, + unsigned int line, int errno) { char nbuf[16]; const char *errstr; @@ -292,10 +552,13 @@ void __ext4_std_error (struct super_block * sb, const char * function, (sb->s_flags & MS_RDONLY)) return; - errstr = ext4_decode_error(sb, errno, nbuf); - printk (KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n", - sb->s_id, function, errstr); + if (ext4_error_ratelimit(sb)) { + errstr = ext4_decode_error(sb, errno, nbuf); + printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", + sb->s_id, function, line, errstr); + } + save_error_info(sb, function, line); ext4_handle_error(sb); } @@ -309,45 +572,122 @@ void __ext4_std_error (struct super_block * sb, const char * function, * case we take the easy way out and panic immediately. */ -void ext4_abort (struct super_block * sb, const char * function, - const char * fmt, ...) +void __ext4_abort(struct super_block *sb, const char *function, + unsigned int line, const char *fmt, ...) { va_list args; - printk (KERN_CRIT "ext4_abort called.\n"); - + save_error_info(sb, function, line); va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function); + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id, + function, line); vprintk(fmt, args); printk("\n"); va_end(args); + if ((sb->s_flags & MS_RDONLY) == 0) { + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); + EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + /* + * Make sure updated value of ->s_mount_flags will be visible + * before ->s_flags update + */ + smp_wmb(); + sb->s_flags |= MS_RDONLY; + if (EXT4_SB(sb)->s_journal) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); + save_error_info(sb, function, line); + } if (test_opt(sb, ERRORS_PANIC)) panic("EXT4-fs panic from previous error\n"); +} - if (sb->s_flags & MS_RDONLY) +void __ext4_msg(struct super_block *sb, + const char *prefix, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) return; - printk(KERN_CRIT "Remounting filesystem read-only\n"); - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - sb->s_flags |= MS_RDONLY; - EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT; - jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); + va_end(args); } -void ext4_warning (struct super_block * sb, const char * function, - const char * fmt, ...) +void __ext4_warning(struct super_block *sb, const char *function, + unsigned int line, const char *fmt, ...) { + struct va_format vaf; va_list args; + if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), + "EXT4-fs warning")) + return; + va_start(args, fmt); - printk(KERN_WARNING "EXT4-fs warning (device %s): %s: ", - sb->s_id, function); - vprintk(fmt, args); - printk("\n"); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", + sb->s_id, function, line, &vaf); va_end(args); } +void __ext4_grp_locked_error(const char *function, unsigned int line, + struct super_block *sb, ext4_group_t grp, + unsigned long ino, ext4_fsblk_t block, + const char *fmt, ...) 
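/*
 * __ext4_msg() and __ext4_warning() above gate their output through
 * ___ratelimit() so a flood of repeated failures cannot drown the
 * log.  Minimal userspace sketch of window-plus-burst ratelimiting
 * (hypothetical names, not the kernel's implementation):
 */
#include <time.h>

struct ratelimit {
	time_t window_start;
	int interval;	/* window length, seconds */
	int burst;	/* messages allowed per window */
	int seen;
};

static int ratelimit_ok(struct ratelimit *rl)
{
	time_t now = time(NULL);

	if (now - rl->window_start >= rl->interval) {
		rl->window_start = now;	/* new window: reset the counter */
		rl->seen = 0;
	}
	return rl->seen++ < rl->burst;
}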
+__releases(bitlock) +__acquires(bitlock) +{ + struct va_format vaf; + va_list args; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + es->s_last_error_ino = cpu_to_le32(ino); + es->s_last_error_block = cpu_to_le64(block); + __save_error_info(sb, function, line); + + if (ext4_error_ratelimit(sb)) { + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", + sb->s_id, function, line, grp); + if (ino) + printk(KERN_CONT "inode %lu: ", ino); + if (block) + printk(KERN_CONT "block %llu:", + (unsigned long long) block); + printk(KERN_CONT "%pV\n", &vaf); + va_end(args); + } + + if (test_opt(sb, ERRORS_CONT)) { + ext4_commit_super(sb, 0); + return; + } + + ext4_unlock_group(sb, grp); + ext4_handle_error(sb); + /* + * We only get here in the ERRORS_RO case; relocking the group + * may be dangerous, but nothing bad will happen since the + * filesystem will have already been marked read/only and the + * journal has been aborted. We return 1 as a hint to callers + * who might what to use the return value from + * ext4_grp_locked_error() to distinguish between the + * ERRORS_CONT and ERRORS_RO case, and perhaps return more + * aggressively from the ext4 function in question, with a + * more appropriate error code. + */ + ext4_lock_group(sb, grp); + return; +} + void ext4_update_dynamic_rev(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; @@ -355,7 +695,7 @@ void ext4_update_dynamic_rev(struct super_block *sb) if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) return; - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, "updating to rev %d because of new feature flag, " "running e2fsck is recommended", EXT4_DYNAMIC_REV); @@ -373,81 +713,21 @@ void ext4_update_dynamic_rev(struct super_block *sb) */ } -int ext4_update_compat_feature(handle_t *handle, - struct super_block *sb, __u32 compat) -{ - int err = 0; - if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) { - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_COMPAT_FEATURE(sb, compat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - -int ext4_update_rocompat_feature(handle_t *handle, - struct super_block *sb, __u32 rocompat) -{ - int err = 0; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) { - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - -int ext4_update_incompat_feature(handle_t *handle, - struct super_block *sb, __u32 incompat) -{ - int err = 0; - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) { - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_INCOMPAT_FEATURE(sb, incompat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - /* * Open the external journal device */ -static struct block_device *ext4_blkdev_get(dev_t dev) +static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) { struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev 
= open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) goto fail; return bdev; fail: - printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n", + ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld", __bdevname(dev, b), PTR_ERR(bdev)); return NULL; } @@ -455,23 +735,19 @@ fail: /* * Release the journal device */ -static int ext4_blkdev_put(struct block_device *bdev) +static void ext4_blkdev_put(struct block_device *bdev) { - bd_release(bdev); - return blkdev_put(bdev); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } -static int ext4_blkdev_remove(struct ext4_sb_info *sbi) +static void ext4_blkdev_remove(struct ext4_sb_info *sbi) { struct block_device *bdev; - int ret = -ENODEV; - bdev = sbi->journal_bdev; if (bdev) { - ret = ext4_blkdev_put(bdev); + ext4_blkdev_put(bdev); sbi->journal_bdev = NULL; } - return ret; } static inline struct inode *orphan_list_entry(struct list_head *l) @@ -483,8 +759,8 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) { struct list_head *l; - printk(KERN_ERR "sb orphan head is %d\n", - le32_to_cpu(sbi->s_es->s_last_orphan)); + ext4_msg(sb, KERN_ERR, "sb orphan head is %d", + le32_to_cpu(sbi->s_es->s_last_orphan)); printk(KERN_ERR "sb_info orphan list:\n"); list_for_each(l, &sbi->s_orphan) { @@ -497,30 +773,54 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) } } -static void ext4_put_super (struct super_block * sb) +static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; - int i; + int i, err; + + ext4_unregister_li_request(sb); + dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + + flush_workqueue(sbi->rsv_conversion_wq); + destroy_workqueue(sbi->rsv_conversion_wq); + + if (sbi->s_journal) { + err = jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + if (err < 0) + ext4_abort(sb, "Couldn't clean up the journal"); + } + ext4_es_unregister_shrinker(sbi); + del_timer_sync(&sbi->s_err_report); + ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); ext4_xattr_put_super(sb); - jbd2_journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); - BUFFER_TRACE(sbi->s_sbh, "marking dirty"); - mark_buffer_dirty(sbi->s_sbh); - ext4_commit_super(sb, es, 1); } + if (!(sb->s_flags & MS_RDONLY)) + ext4_commit_super(sb, 1); + + if (sbi->s_proc) { + remove_proc_entry("options", sbi->s_proc); + remove_proc_entry(sb->s_id, ext4_proc_root); + } + kobject_del(&sbi->s_kobj); for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); - percpu_counter_destroy(&sbi->s_freeblocks_counter); + ext4_kvfree(sbi->s_group_desc); + ext4_kvfree(sbi->s_flex_groups); + percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_counter_destroy(&sbi->s_extent_cache_cnt); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) @@ -546,9 +846,23 @@ static void ext4_put_super (struct super_block * sb) invalidate_bdev(sbi->journal_bdev); ext4_blkdev_remove(sbi); } + if (sbi->s_mb_cache) { + ext4_xattr_destroy_cache(sbi->s_mb_cache); + sbi->s_mb_cache = NULL; + } + if 
(sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); sb->s_fs_info = NULL; + /* + * Now that we are completely done shutting down the + * superblock, we need to actually destroy the kobject. + */ + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); + kfree(sbi->s_blockgroup_lock); kfree(sbi); - return; } static struct kmem_cache *ext4_inode_cachep; @@ -563,44 +877,76 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); if (!ei) return NULL; -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL - ei->i_acl = EXT4_ACL_NOT_CACHED; - ei->i_default_acl = EXT4_ACL_NOT_CACHED; -#endif - ei->i_block_alloc_info = NULL; + ei->vfs_inode.i_version = 1; - memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); + spin_lock_init(&ei->i_raw_lock); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); + ext4_es_init_tree(&ei->i_es_tree); + rwlock_init(&ei->i_es_lock); + INIT_LIST_HEAD(&ei->i_es_lru); + ei->i_es_lru_nr = 0; + ei->i_touch_when = 0; + ei->i_reserved_data_blocks = 0; + ei->i_reserved_meta_blocks = 0; + ei->i_allocated_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; + ei->i_da_metadata_calc_last_lblock = 0; + spin_lock_init(&(ei->i_block_reservation_lock)); +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +#endif + ei->jinode = NULL; + INIT_LIST_HEAD(&ei->i_rsv_conversion_list); + spin_lock_init(&ei->i_completed_io_lock); + ei->i_sync_tid = 0; + ei->i_datasync_tid = 0; + atomic_set(&ei->i_ioend_count, 0); + atomic_set(&ei->i_unwritten, 0); + INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); + return &ei->vfs_inode; } +static int ext4_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + trace_ext4_drop_inode(inode, drop); + return drop; +} + +static void ext4_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); +} + static void ext4_destroy_inode(struct inode *inode) { if (!list_empty(&(EXT4_I(inode)->i_orphan))) { - printk("EXT4 Inode %p: orphan list check failed!\n", - EXT4_I(inode)); + ext4_msg(inode->i_sb, KERN_ERR, + "Inode %lu (%p): orphan list check failed!", + inode->i_ino, EXT4_I(inode)); print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, EXT4_I(inode), sizeof(struct ext4_inode_info), true); dump_stack(); } - kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); + call_rcu(&inode->i_rcu, ext4_i_callback); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; INIT_LIST_HEAD(&ei->i_orphan); -#ifdef CONFIG_EXT4DEV_FS_XATTR init_rwsem(&ei->xattr_sem); -#endif init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", sizeof(struct ext4_inode_info), @@ -614,153 +960,32 @@ static int init_inodecache(void) static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. 
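/*
 * ext4_i_callback() above frees inodes via call_rcu(), so RCU-walk
 * path lookups never touch freed memory; the rcu_barrier() being added
 * here must then run before kmem_cache_destroy() so that every queued
 * callback has fired.  Kernel-style sketch of the pairing
 * (hypothetical my_obj/my_cachep; assumes <linux/rcupdate.h> and
 * <linux/slab.h>):
 */
struct my_obj {
	struct rcu_head rcu;
	/* ... payload ... */
};

static struct kmem_cache *my_cachep;

static void my_obj_free_rcu(struct rcu_head *head)
{
	kmem_cache_free(my_cachep, container_of(head, struct my_obj, rcu));
}

static void my_obj_release(struct my_obj *obj)
{
	call_rcu(&obj->rcu, my_obj_free_rcu);	/* defer past RCU readers */
}

static void my_cache_teardown(void)
{
	rcu_barrier();	/* flush all pending my_obj_free_rcu() calls */
	kmem_cache_destroy(my_cachep);
}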
+ */ + rcu_barrier(); kmem_cache_destroy(ext4_inode_cachep); } -static void ext4_clear_inode(struct inode *inode) +void ext4_clear_inode(struct inode *inode) { - struct ext4_block_alloc_info *rsv = EXT4_I(inode)->i_block_alloc_info; -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL - if (EXT4_I(inode)->i_acl && - EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) { - posix_acl_release(EXT4_I(inode)->i_acl); - EXT4_I(inode)->i_acl = EXT4_ACL_NOT_CACHED; - } - if (EXT4_I(inode)->i_default_acl && - EXT4_I(inode)->i_default_acl != EXT4_ACL_NOT_CACHED) { - posix_acl_release(EXT4_I(inode)->i_default_acl); - EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED; + invalidate_inode_buffers(inode); + clear_inode(inode); + dquot_drop(inode); + ext4_discard_preallocations(inode); + ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); + ext4_es_lru_del(inode); + if (EXT4_I(inode)->jinode) { + jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), + EXT4_I(inode)->jinode); + jbd2_free_inode(EXT4_I(inode)->jinode); + EXT4_I(inode)->jinode = NULL; } -#endif - ext4_discard_reservation(inode); - EXT4_I(inode)->i_block_alloc_info = NULL; - if (unlikely(rsv)) - kfree(rsv); -} - -static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) -{ -#if defined(CONFIG_QUOTA) - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sbi->s_jquota_fmt) - seq_printf(seq, ",jqfmt=%s", - (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0"); - - if (sbi->s_qf_names[USRQUOTA]) - seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); - - if (sbi->s_qf_names[GRPQUOTA]) - seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); - - if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) - seq_puts(seq, ",usrquota"); - - if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) - seq_puts(seq, ",grpquota"); -#endif } -/* - * Show an option if - * - it's set to a non-default value OR - * - if the per-sb default is different from the global default - */ -static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) -{ - int def_errors; - unsigned long def_mount_opts; - struct super_block *sb = vfs->mnt_sb; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - - def_mount_opts = le32_to_cpu(es->s_default_mount_opts); - def_errors = le16_to_cpu(es->s_errors); - - if (sbi->s_sb_block != 1) - seq_printf(seq, ",sb=%llu", sbi->s_sb_block); - if (test_opt(sb, MINIX_DF)) - seq_puts(seq, ",minixdf"); - if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS)) - seq_puts(seq, ",grpid"); - if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS)) - seq_puts(seq, ",nogrpid"); - if (sbi->s_resuid != EXT4_DEF_RESUID || - le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) { - seq_printf(seq, ",resuid=%u", sbi->s_resuid); - } - if (sbi->s_resgid != EXT4_DEF_RESGID || - le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) { - seq_printf(seq, ",resgid=%u", sbi->s_resgid); - } - if (test_opt(sb, ERRORS_RO)) { - if (def_errors == EXT4_ERRORS_PANIC || - def_errors == EXT4_ERRORS_CONTINUE) { - seq_puts(seq, ",errors=remount-ro"); - } - } - if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) - seq_puts(seq, ",errors=continue"); - if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) - seq_puts(seq, ",errors=panic"); - if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16)) - seq_puts(seq, ",nouid32"); - if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG)) - seq_puts(seq, ",debug"); - if (test_opt(sb, OLDALLOC)) - seq_puts(seq, ",oldalloc"); -#ifdef 
CONFIG_EXT4DEV_FS_XATTR - if (test_opt(sb, XATTR_USER) && - !(def_mount_opts & EXT4_DEFM_XATTR_USER)) - seq_puts(seq, ",user_xattr"); - if (!test_opt(sb, XATTR_USER) && - (def_mount_opts & EXT4_DEFM_XATTR_USER)) { - seq_puts(seq, ",nouser_xattr"); - } -#endif -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL - if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) - seq_puts(seq, ",acl"); - if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) - seq_puts(seq, ",noacl"); -#endif - if (!test_opt(sb, RESERVATION)) - seq_puts(seq, ",noreservation"); - if (sbi->s_commit_interval) { - seq_printf(seq, ",commit=%u", - (unsigned) (sbi->s_commit_interval / HZ)); - } - if (test_opt(sb, BARRIER)) - seq_puts(seq, ",barrier=1"); - if (test_opt(sb, NOBH)) - seq_puts(seq, ",nobh"); - if (!test_opt(sb, EXTENTS)) - seq_puts(seq, ",noextents"); - if (!test_opt(sb, MBALLOC)) - seq_puts(seq, ",nomballoc"); - if (test_opt(sb, I_VERSION)) - seq_puts(seq, ",i_version"); - - if (sbi->s_stripe) - seq_printf(seq, ",stripe=%lu", sbi->s_stripe); - /* - * journal mode get enabled in different ways - * So just print the value even if we didn't specify it - */ - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - seq_puts(seq, ",data=journal"); - else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) - seq_puts(seq, ",data=ordered"); - else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) - seq_puts(seq, ",data=writeback"); - - ext4_show_quota_options(seq, sb); - return 0; -} - - static struct inode *ext4_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) + u64 ino, u32 generation) { struct inode *inode; @@ -789,60 +1014,92 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb, } static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) + int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); } static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) + int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); } +/* + * Try to release metadata pages (indirect blocks, directories) which are + * mapped via the block device. Since these pages could have journal heads + * which would prevent try_to_free_buffers() from freeing them, we must use + * jbd2 layer's try_to_free_buffers() function to release them. + */ +static int bdev_try_to_free_page(struct super_block *sb, struct page *page, + gfp_t wait) +{ + journal_t *journal = EXT4_SB(sb)->s_journal; + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + if (journal) + return jbd2_journal_try_to_free_buffers(journal, page, + wait & ~__GFP_WAIT); + return try_to_free_buffers(page); +} + #ifdef CONFIG_QUOTA -#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") -#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) +#define QTYPE2NAME(t) ((t) == USRQUOTA ? 
"user" : "group") +#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) -static int ext4_dquot_initialize(struct inode *inode, int type); -static int ext4_dquot_drop(struct inode *inode); static int ext4_write_dquot(struct dquot *dquot); static int ext4_acquire_dquot(struct dquot *dquot); static int ext4_release_dquot(struct dquot *dquot); static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); -static int ext4_quota_on(struct super_block *sb, int type, int format_id, char *path); +static int ext4_quota_on(struct super_block *sb, int type, int format_id, + struct path *path); +static int ext4_quota_on_sysfile(struct super_block *sb, int type, + int format_id); +static int ext4_quota_off(struct super_block *sb, int type); +static int ext4_quota_off_sysfile(struct super_block *sb, int type); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); +static int ext4_quota_enable(struct super_block *sb, int type, int format_id, + unsigned int flags); +static int ext4_enable_quotas(struct super_block *sb); -static struct dquot_operations ext4_quota_operations = { - .initialize = ext4_dquot_initialize, - .drop = ext4_dquot_drop, - .alloc_space = dquot_alloc_space, - .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, - .free_inode = dquot_free_inode, - .transfer = dquot_transfer, +static const struct dquot_operations ext4_quota_operations = { + .get_reserved_space = ext4_get_reserved_space, .write_dquot = ext4_write_dquot, .acquire_dquot = ext4_acquire_dquot, .release_dquot = ext4_release_dquot, .mark_dirty = ext4_mark_dquot_dirty, - .write_info = ext4_write_info + .write_info = ext4_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, }; -static struct quotactl_ops ext4_qctl_operations = { +static const struct quotactl_ops ext4_qctl_operations = { .quota_on = ext4_quota_on, - .quota_off = vfs_quota_off, - .quota_sync = vfs_quota_sync, - .get_info = vfs_get_dqinfo, - .set_info = vfs_set_dqinfo, - .get_dqblk = vfs_get_dqblk, - .set_dqblk = vfs_set_dqblk + .quota_off = ext4_quota_off, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk +}; + +static const struct quotactl_ops ext4_qctl_sysfile_operations = { + .quota_on_meta = ext4_quota_on_sysfile, + .quota_off = ext4_quota_off_sysfile, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk }; #endif @@ -851,20 +1108,39 @@ static const struct super_operations ext4_sops = { .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, - .delete_inode = ext4_delete_inode, + .drop_inode = ext4_drop_inode, + .evict_inode = ext4_evict_inode, .put_super = ext4_put_super, - .write_super = ext4_write_super, .sync_fs = ext4_sync_fs, - .write_super_lockfs = ext4_write_super_lockfs, - .unlockfs = ext4_unlockfs, + .freeze_fs = ext4_freeze, + .unfreeze_fs = ext4_unfreeze, + .statfs = ext4_statfs, + .remount_fs = ext4_remount, + .show_options = ext4_show_options, +#ifdef CONFIG_QUOTA + .quota_read = ext4_quota_read, + .quota_write = ext4_quota_write, +#endif + 
.bdev_try_to_free_page = bdev_try_to_free_page, +}; + +static const struct super_operations ext4_nojournal_sops = { + .alloc_inode = ext4_alloc_inode, + .destroy_inode = ext4_destroy_inode, + .write_inode = ext4_write_inode, + .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, + .evict_inode = ext4_evict_inode, + .sync_fs = ext4_sync_fs_nojournal, + .put_super = ext4_put_super, .statfs = ext4_statfs, .remount_fs = ext4_remount, - .clear_inode = ext4_clear_inode, .show_options = ext4_show_options, #ifdef CONFIG_QUOTA .quota_read = ext4_quota_read, .quota_write = ext4_quota_write, #endif + .bdev_try_to_free_page = bdev_try_to_free_page, }; static const struct export_operations ext4_export_ops = { @@ -876,20 +1152,26 @@ static const struct export_operations ext4_export_ops = { enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_nouid32, Opt_debug, Opt_removed, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, - Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, - Opt_journal_checksum, Opt_journal_async_commit, + Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, + Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, + Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, - Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, - Opt_mballoc, Opt_nomballoc, Opt_stripe, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, + Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, + Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, + Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, + Opt_inode_readahead_blks, Opt_journal_ioprio, + Opt_dioread_nolock, Opt_dioread_lock, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, + Opt_max_dir_size_kb, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_bsd_df, "bsddf"}, {Opt_minix_df, "minixdf"}, {Opt_grpid, "grpid"}, @@ -903,49 +1185,71 @@ static match_table_t tokens = { {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, {Opt_nouid32, "nouid32"}, - {Opt_nocheck, "nocheck"}, - {Opt_nocheck, "check=none"}, {Opt_debug, "debug"}, - {Opt_oldalloc, "oldalloc"}, - {Opt_orlov, "orlov"}, + {Opt_removed, "oldalloc"}, + {Opt_removed, "orlov"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, - {Opt_reservation, "reservation"}, - {Opt_noreservation, "noreservation"}, + {Opt_noload, "norecovery"}, {Opt_noload, "noload"}, - {Opt_nobh, "nobh"}, - {Opt_bh, "bh"}, + {Opt_removed, "nobh"}, + {Opt_removed, "bh"}, {Opt_commit, "commit=%u"}, - {Opt_journal_update, "journal=update"}, - {Opt_journal_inum, "journal=%u"}, + {Opt_min_batch_time, "min_batch_time=%u"}, + {Opt_max_batch_time, "max_batch_time=%u"}, {Opt_journal_dev, "journal_dev=%u"}, + {Opt_journal_path, "journal_path=%s"}, {Opt_journal_checksum, "journal_checksum"}, {Opt_journal_async_commit, "journal_async_commit"}, {Opt_abort, "abort"}, 
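/*
 * The tokens[] match table beginning above (and continuing below) is
 * consumed by match_token(): the mount option string is split on
 * commas and each piece is matched against a pattern, with %u/%s
 * captures landing in substring_t slots.  Kernel-style sketch of the
 * consumer side (hypothetical demo_tokens; assumes <linux/parser.h>
 * and <linux/string.h>):
 */
enum { Opt_size, Opt_verbose, Opt_bad };

static const match_table_t demo_tokens = {
	{Opt_size,	"size=%u"},
	{Opt_verbose,	"verbose"},
	{Opt_bad,	NULL},
};

static int demo_parse(char *options)
{
	substring_t args[MAX_OPT_ARGS];
	char *p;
	int arg;

	while ((p = strsep(&options, ",")) != NULL) {
		if (!*p)
			continue;
		switch (match_token(p, demo_tokens, args)) {
		case Opt_size:
			if (match_int(&args[0], &arg))
				return -1;	/* malformed number */
			break;
		case Opt_verbose:
			break;
		default:
			return -1;	/* unrecognized option */
		}
	}
	return 0;
}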
{Opt_data_journal, "data=journal"}, {Opt_data_ordered, "data=ordered"}, {Opt_data_writeback, "data=writeback"}, + {Opt_data_err_abort, "data_err=abort"}, + {Opt_data_err_ignore, "data_err=ignore"}, {Opt_offusrjquota, "usrjquota="}, {Opt_usrjquota, "usrjquota=%s"}, {Opt_offgrpjquota, "grpjquota="}, {Opt_grpjquota, "grpjquota=%s"}, {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_grpquota, "grpquota"}, {Opt_noquota, "noquota"}, {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, {Opt_barrier, "barrier=%u"}, - {Opt_extents, "extents"}, - {Opt_noextents, "noextents"}, + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, - {Opt_mballoc, "mballoc"}, - {Opt_nomballoc, "nomballoc"}, {Opt_stripe, "stripe=%u"}, + {Opt_delalloc, "delalloc"}, + {Opt_nodelalloc, "nodelalloc"}, + {Opt_removed, "mblk_io_submit"}, + {Opt_removed, "nomblk_io_submit"}, + {Opt_block_validity, "block_validity"}, + {Opt_noblock_validity, "noblock_validity"}, + {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, + {Opt_journal_ioprio, "journal_ioprio=%u"}, + {Opt_auto_da_alloc, "auto_da_alloc=%u"}, + {Opt_auto_da_alloc, "auto_da_alloc"}, + {Opt_noauto_da_alloc, "noauto_da_alloc"}, + {Opt_dioread_nolock, "dioread_nolock"}, + {Opt_dioread_lock, "dioread_lock"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_init_itable, "init_itable=%u"}, + {Opt_init_itable, "init_itable"}, + {Opt_noinit_itable, "noinit_itable"}, + {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, + {Opt_removed, "check=none"}, /* mount option from ext2/3 */ + {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ + {Opt_removed, "reservation"}, /* mount option from ext2/3 */ + {Opt_removed, "noreservation"}, /* mount option from ext2/3 */ + {Opt_removed, "journal=%u"}, /* mount option from ext2/3 */ {Opt_err, NULL}, - {Opt_resize, "resize"}, }; static ext4_fsblk_t get_sb_block(void **data) @@ -955,400 +1259,633 @@ static ext4_fsblk_t get_sb_block(void **data) if (!options || strncmp(options, "sb=", 3) != 0) return 1; /* Default location */ + options += 3; - /*todo: use simple_strtoll with >32bit ext4 */ + /* TODO: use simple_strtoll with >32bit ext4 */ sb_block = simple_strtoul(options, &options, 0); if (*options && *options != ',') { - printk("EXT4-fs: Invalid sb specification: %s\n", + printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n", (char *) *data); return 1; } if (*options == ',') options++; *data = (void *) options; + return sb_block; } -static int parse_options (char *options, struct super_block *sb, - unsigned int *inum, unsigned long *journal_devnum, - ext4_fsblk_t *n_blocks_count, int is_remount) +#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) +static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" + "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; + +#ifdef CONFIG_QUOTA +static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) { struct ext4_sb_info *sbi = EXT4_SB(sb); - char * p; - substring_t args[MAX_OPT_ARGS]; - int data_opt = 0; - int option; -#ifdef CONFIG_QUOTA - int qtype; char *qname; -#endif + int ret = -1; + + if (sb_any_quota_loaded(sb) && + !sbi->s_qf_names[qtype]) { + ext4_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return -1; + } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) { + ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options " + "when QUOTA feature 
is enabled"); + return -1; + } + qname = match_strdup(args); + if (!qname) { + ext4_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return -1; + } + if (sbi->s_qf_names[qtype]) { + if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + ret = 1; + else + ext4_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + goto errout; + } + if (strchr(qname, '/')) { + ext4_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + goto errout; + } + sbi->s_qf_names[qtype] = qname; + set_opt(sb, QUOTA); + return 1; +errout: + kfree(qname); + return ret; +} - if (!options) - return 1; +static int clear_qf_name(struct super_block *sb, int qtype) +{ - while ((p = strsep (&options, ",")) != NULL) { - int token; - if (!*p) - continue; + struct ext4_sb_info *sbi = EXT4_SB(sb); - token = match_token(p, tokens, args); - switch (token) { - case Opt_bsd_df: - clear_opt (sbi->s_mount_opt, MINIX_DF); - break; - case Opt_minix_df: - set_opt (sbi->s_mount_opt, MINIX_DF); - break; - case Opt_grpid: - set_opt (sbi->s_mount_opt, GRPID); - break; - case Opt_nogrpid: - clear_opt (sbi->s_mount_opt, GRPID); - break; - case Opt_resuid: - if (match_int(&args[0], &option)) - return 0; - sbi->s_resuid = option; - break; - case Opt_resgid: - if (match_int(&args[0], &option)) - return 0; - sbi->s_resgid = option; - break; - case Opt_sb: - /* handled by get_sb_block() instead of here */ - /* *sb_block = match_int(&args[0]); */ - break; - case Opt_err_panic: - clear_opt (sbi->s_mount_opt, ERRORS_CONT); - clear_opt (sbi->s_mount_opt, ERRORS_RO); - set_opt (sbi->s_mount_opt, ERRORS_PANIC); - break; - case Opt_err_ro: - clear_opt (sbi->s_mount_opt, ERRORS_CONT); - clear_opt (sbi->s_mount_opt, ERRORS_PANIC); - set_opt (sbi->s_mount_opt, ERRORS_RO); - break; - case Opt_err_cont: - clear_opt (sbi->s_mount_opt, ERRORS_RO); - clear_opt (sbi->s_mount_opt, ERRORS_PANIC); - set_opt (sbi->s_mount_opt, ERRORS_CONT); - break; - case Opt_nouid32: - set_opt (sbi->s_mount_opt, NO_UID32); - break; - case Opt_nocheck: - clear_opt (sbi->s_mount_opt, CHECK); - break; - case Opt_debug: - set_opt (sbi->s_mount_opt, DEBUG); - break; - case Opt_oldalloc: - set_opt (sbi->s_mount_opt, OLDALLOC); - break; - case Opt_orlov: - clear_opt (sbi->s_mount_opt, OLDALLOC); - break; -#ifdef CONFIG_EXT4DEV_FS_XATTR - case Opt_user_xattr: - set_opt (sbi->s_mount_opt, XATTR_USER); - break; - case Opt_nouser_xattr: - clear_opt (sbi->s_mount_opt, XATTR_USER); - break; + if (sb_any_quota_loaded(sb) && + sbi->s_qf_names[qtype]) { + ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return -1; + } + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 1; +} +#endif + +#define MOPT_SET 0x0001 +#define MOPT_CLEAR 0x0002 +#define MOPT_NOSUPPORT 0x0004 +#define MOPT_EXPLICIT 0x0008 +#define MOPT_CLEAR_ERR 0x0010 +#define MOPT_GTE0 0x0020 +#ifdef CONFIG_QUOTA +#define MOPT_Q 0 +#define MOPT_QFMT 0x0040 #else - case Opt_user_xattr: - case Opt_nouser_xattr: - printk("EXT4 (no)user_xattr options not supported\n"); - break; +#define MOPT_Q MOPT_NOSUPPORT +#define MOPT_QFMT MOPT_NOSUPPORT #endif -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL - case Opt_acl: - set_opt(sbi->s_mount_opt, POSIX_ACL); - break; - case Opt_noacl: - clear_opt(sbi->s_mount_opt, POSIX_ACL); - break; +#define MOPT_DATAJ 0x0080 +#define MOPT_NO_EXT2 0x0100 +#define MOPT_NO_EXT3 0x0200 +#define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) +#define MOPT_STRING 0x0400 + +static const struct mount_opts { + int token; + 
int mount_opt; + int flags; +} ext4_mount_opts[] = { + {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, + {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, + {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, + {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, + {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, + {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, + {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, + MOPT_EXT4_ONLY | MOPT_SET}, + {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, + MOPT_EXT4_ONLY | MOPT_CLEAR}, + {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, + {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, + {Opt_delalloc, EXT4_MOUNT_DELALLOC, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, + {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, + MOPT_EXT4_ONLY | MOPT_CLEAR}, + {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, + MOPT_EXT4_ONLY | MOPT_SET}, + {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | + EXT4_MOUNT_JOURNAL_CHECKSUM), + MOPT_EXT4_ONLY | MOPT_SET}, + {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, + {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, + MOPT_NO_EXT2 | MOPT_SET}, + {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, + MOPT_NO_EXT2 | MOPT_CLEAR}, + {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, + {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, + {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, + {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, + {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, + {Opt_commit, 0, MOPT_GTE0}, + {Opt_max_batch_time, 0, MOPT_GTE0}, + {Opt_min_batch_time, 0, MOPT_GTE0}, + {Opt_inode_readahead_blks, 0, MOPT_GTE0}, + {Opt_init_itable, 0, MOPT_GTE0}, + {Opt_stripe, 0, MOPT_GTE0}, + {Opt_resuid, 0, MOPT_GTE0}, + {Opt_resgid, 0, MOPT_GTE0}, + {Opt_journal_dev, 0, MOPT_GTE0}, + {Opt_journal_path, 0, MOPT_STRING}, + {Opt_journal_ioprio, 0, MOPT_GTE0}, + {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, + {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ}, + {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, + MOPT_NO_EXT2 | MOPT_DATAJ}, + {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, + {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, +#ifdef CONFIG_EXT4_FS_POSIX_ACL + {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, + {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, #else - case Opt_acl: - case Opt_noacl: - printk("EXT4 (no)acl options not supported\n"); - break; + {Opt_acl, 0, MOPT_NOSUPPORT}, + {Opt_noacl, 0, MOPT_NOSUPPORT}, #endif - case Opt_reservation: - set_opt(sbi->s_mount_opt, RESERVATION); - break; - case Opt_noreservation: - clear_opt(sbi->s_mount_opt, RESERVATION); - break; - case Opt_journal_update: - /* @@@ FIXME */ - /* Eventually we will want to be able to create - a journal file here. For now, only allow the - user to specify an existing inode to be the - journal file. 
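/*
 * The ext4_mount_opts[] table earlier in this hunk replaces the old
 * per-option switch: each entry pairs a token with a mount-option
 * bitmask and MOPT_SET/MOPT_CLEAR semantics, and one generic path
 * applies it.  Condensed sketch of that core (hypothetical DEMO_*
 * names):
 */
#define DEMO_SET	0x1
#define DEMO_CLEAR	0x2

struct demo_opt { int token; unsigned long mask; int flags; };

static int demo_apply(unsigned long *mount_opt, const struct demo_opt *m,
		      int arg)	/* 0 or 1, e.g. from "barrier=%u" */
{
	if (m->flags & DEMO_CLEAR)
		arg = !arg;	/* a "noX" entry inverts the sense */
	else if (!(m->flags & DEMO_SET))
		return -1;	/* buggy table entry */
	if (arg)
		*mount_opt |= m->mask;
	else
		*mount_opt &= ~m->mask;
	return 0;
}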
*/ - if (is_remount) { - printk(KERN_ERR "EXT4-fs: cannot specify " - "journal on remount\n"); - return 0; - } - set_opt (sbi->s_mount_opt, UPDATE_JOURNAL); - break; - case Opt_journal_inum: - if (is_remount) { - printk(KERN_ERR "EXT4-fs: cannot specify " - "journal on remount\n"); - return 0; - } - if (match_int(&args[0], &option)) - return 0; - *inum = option; - break; - case Opt_journal_dev: - if (is_remount) { - printk(KERN_ERR "EXT4-fs: cannot specify " - "journal on remount\n"); - return 0; - } - if (match_int(&args[0], &option)) - return 0; - *journal_devnum = option; - break; - case Opt_journal_checksum: - set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); - break; - case Opt_journal_async_commit: - set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); - set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); - break; - case Opt_noload: - set_opt (sbi->s_mount_opt, NOLOAD); - break; - case Opt_commit: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - if (option == 0) - option = JBD2_DEFAULT_MAX_COMMIT_AGE; - sbi->s_commit_interval = HZ * option; - break; - case Opt_data_journal: - data_opt = EXT4_MOUNT_JOURNAL_DATA; - goto datacheck; - case Opt_data_ordered: - data_opt = EXT4_MOUNT_ORDERED_DATA; - goto datacheck; - case Opt_data_writeback: - data_opt = EXT4_MOUNT_WRITEBACK_DATA; - datacheck: - if (is_remount) { - if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS) - != data_opt) { - printk(KERN_ERR - "EXT4-fs: cannot change data " - "mode on remount\n"); - return 0; - } - } else { - sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS; - sbi->s_mount_opt |= data_opt; - } - break; + {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, + {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, + {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, + {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, + MOPT_SET | MOPT_Q}, + {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, + MOPT_SET | MOPT_Q}, + {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | + EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q}, + {Opt_usrjquota, 0, MOPT_Q}, + {Opt_grpjquota, 0, MOPT_Q}, + {Opt_offusrjquota, 0, MOPT_Q}, + {Opt_offgrpjquota, 0, MOPT_Q}, + {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, + {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, + {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, + {Opt_max_dir_size_kb, 0, MOPT_GTE0}, + {Opt_err, 0, 0} +}; + +static int handle_mount_opt(struct super_block *sb, char *opt, int token, + substring_t *args, unsigned long *journal_devnum, + unsigned int *journal_ioprio, int is_remount) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + const struct mount_opts *m; + kuid_t uid; + kgid_t gid; + int arg = 0; + #ifdef CONFIG_QUOTA - case Opt_usrjquota: - qtype = USRQUOTA; - goto set_qf_name; - case Opt_grpjquota: - qtype = GRPQUOTA; -set_qf_name: - if (sb_any_quota_enabled(sb)) { - printk(KERN_ERR - "EXT4-fs: Cannot change journalled " - "quota options when quota turned on.\n"); - return 0; - } - qname = match_strdup(&args[0]); - if (!qname) { - printk(KERN_ERR - "EXT4-fs: not enough memory for " - "storing quotafile name.\n"); - return 0; - } - if (sbi->s_qf_names[qtype] && - strcmp(sbi->s_qf_names[qtype], qname)) { - printk(KERN_ERR - "EXT4-fs: %s quota file already " - "specified.\n", QTYPE2NAME(qtype)); - kfree(qname); - return 0; - } - sbi->s_qf_names[qtype] = qname; - if (strchr(sbi->s_qf_names[qtype], '/')) { - printk(KERN_ERR - "EXT4-fs: quotafile must be on " - "filesystem root.\n"); - kfree(sbi->s_qf_names[qtype]); - sbi->s_qf_names[qtype] = NULL; - return 0; - } - 
set_opt(sbi->s_mount_opt, QUOTA); - break; - case Opt_offusrjquota: - qtype = USRQUOTA; - goto clear_qf_name; - case Opt_offgrpjquota: - qtype = GRPQUOTA; -clear_qf_name: - if (sb_any_quota_enabled(sb)) { - printk(KERN_ERR "EXT4-fs: Cannot change " - "journalled quota options when " - "quota turned on.\n"); - return 0; - } - /* - * The space will be released later when all options - * are confirmed to be correct - */ - sbi->s_qf_names[qtype] = NULL; - break; - case Opt_jqfmt_vfsold: - sbi->s_jquota_fmt = QFMT_VFS_OLD; - break; - case Opt_jqfmt_vfsv0: - sbi->s_jquota_fmt = QFMT_VFS_V0; - break; - case Opt_quota: - case Opt_usrquota: - set_opt(sbi->s_mount_opt, QUOTA); - set_opt(sbi->s_mount_opt, USRQUOTA); - break; - case Opt_grpquota: - set_opt(sbi->s_mount_opt, QUOTA); - set_opt(sbi->s_mount_opt, GRPQUOTA); - break; - case Opt_noquota: - if (sb_any_quota_enabled(sb)) { - printk(KERN_ERR "EXT4-fs: Cannot change quota " - "options when quota turned on.\n"); - return 0; - } - clear_opt(sbi->s_mount_opt, QUOTA); - clear_opt(sbi->s_mount_opt, USRQUOTA); - clear_opt(sbi->s_mount_opt, GRPQUOTA); - break; -#else - case Opt_quota: - case Opt_usrquota: - case Opt_grpquota: - case Opt_usrjquota: - case Opt_grpjquota: - case Opt_offusrjquota: - case Opt_offgrpjquota: - case Opt_jqfmt_vfsold: - case Opt_jqfmt_vfsv0: - printk(KERN_ERR - "EXT4-fs: journalled quota options not " - "supported.\n"); - break; - case Opt_noquota: - break; + if (token == Opt_usrjquota) + return set_qf_name(sb, USRQUOTA, &args[0]); + else if (token == Opt_grpjquota) + return set_qf_name(sb, GRPQUOTA, &args[0]); + else if (token == Opt_offusrjquota) + return clear_qf_name(sb, USRQUOTA); + else if (token == Opt_offgrpjquota) + return clear_qf_name(sb, GRPQUOTA); #endif - case Opt_abort: - set_opt(sbi->s_mount_opt, ABORT); - break; - case Opt_barrier: - if (match_int(&args[0], &option)) - return 0; - if (option) - set_opt(sbi->s_mount_opt, BARRIER); - else - clear_opt(sbi->s_mount_opt, BARRIER); - break; - case Opt_ignore: + switch (token) { + case Opt_noacl: + case Opt_nouser_xattr: + ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5"); + break; + case Opt_sb: + return 1; /* handled by get_sb_block() */ + case Opt_removed: + ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt); + return 1; + case Opt_abort: + sbi->s_mount_flags |= EXT4_MF_FS_ABORTED; + return 1; + case Opt_i_version: + sb->s_flags |= MS_I_VERSION; + return 1; + } + + for (m = ext4_mount_opts; m->token != Opt_err; m++) + if (token == m->token) break; - case Opt_resize: - if (!is_remount) { - printk("EXT4-fs: resize option only available " - "for remount\n"); - return 0; + + if (m->token == Opt_err) { + ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" " + "or missing value", opt); + return -1; + } + + if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { + ext4_msg(sb, KERN_ERR, + "Mount option \"%s\" incompatible with ext2", opt); + return -1; + } + if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) { + ext4_msg(sb, KERN_ERR, + "Mount option \"%s\" incompatible with ext3", opt); + return -1; + } + + if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg)) + return -1; + if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) + return -1; + if (m->flags & MOPT_EXPLICIT) + set_opt2(sb, EXPLICIT_DELALLOC); + if (m->flags & MOPT_CLEAR_ERR) + clear_opt(sb, ERRORS_MASK); + if (token == Opt_noquota && sb_any_quota_loaded(sb)) { + ext4_msg(sb, KERN_ERR, "Cannot change quota " + "options when quota turned on"); + return -1; + } + + if (m->flags 
& MOPT_NOSUPPORT) { + ext4_msg(sb, KERN_ERR, "%s option not supported", opt); + } else if (token == Opt_commit) { + if (arg == 0) + arg = JBD2_DEFAULT_MAX_COMMIT_AGE; + sbi->s_commit_interval = HZ * arg; + } else if (token == Opt_max_batch_time) { + sbi->s_max_batch_time = arg; + } else if (token == Opt_min_batch_time) { + sbi->s_min_batch_time = arg; + } else if (token == Opt_inode_readahead_blks) { + if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) { + ext4_msg(sb, KERN_ERR, + "EXT4-fs: inode_readahead_blks must be " + "0 or a power of 2 smaller than 2^31"); + return -1; + } + sbi->s_inode_readahead_blks = arg; + } else if (token == Opt_init_itable) { + set_opt(sb, INIT_INODE_TABLE); + if (!args->from) + arg = EXT4_DEF_LI_WAIT_MULT; + sbi->s_li_wait_mult = arg; + } else if (token == Opt_max_dir_size_kb) { + sbi->s_max_dir_size_kb = arg; + } else if (token == Opt_stripe) { + sbi->s_stripe = arg; + } else if (token == Opt_resuid) { + uid = make_kuid(current_user_ns(), arg); + if (!uid_valid(uid)) { + ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg); + return -1; + } + sbi->s_resuid = uid; + } else if (token == Opt_resgid) { + gid = make_kgid(current_user_ns(), arg); + if (!gid_valid(gid)) { + ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg); + return -1; + } + sbi->s_resgid = gid; + } else if (token == Opt_journal_dev) { + if (is_remount) { + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); + return -1; + } + *journal_devnum = arg; + } else if (token == Opt_journal_path) { + char *journal_path; + struct inode *journal_inode; + struct path path; + int error; + + if (is_remount) { + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); + return -1; + } + journal_path = match_strdup(&args[0]); + if (!journal_path) { + ext4_msg(sb, KERN_ERR, "error: could not dup " + "journal device string"); + return -1; + } + + error = kern_path(journal_path, LOOKUP_FOLLOW, &path); + if (error) { + ext4_msg(sb, KERN_ERR, "error: could not find " + "journal device path: error %d", error); + kfree(journal_path); + return -1; + } + + journal_inode = path.dentry->d_inode; + if (!S_ISBLK(journal_inode->i_mode)) { + ext4_msg(sb, KERN_ERR, "error: journal path %s " + "is not a block device", journal_path); + path_put(&path); + kfree(journal_path); + return -1; + } + + *journal_devnum = new_encode_dev(journal_inode->i_rdev); + path_put(&path); + kfree(journal_path); + } else if (token == Opt_journal_ioprio) { + if (arg > 7) { + ext4_msg(sb, KERN_ERR, "Invalid journal IO priority" + " (must be 0-7)"); + return -1; + } + *journal_ioprio = + IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); + } else if (m->flags & MOPT_DATAJ) { + if (is_remount) { + if (!sbi->s_journal) + ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); + else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) { + ext4_msg(sb, KERN_ERR, + "Cannot change data mode on remount"); + return -1; } - if (match_int(&args[0], &option) != 0) - return 0; - *n_blocks_count = option; - break; - case Opt_nobh: - set_opt(sbi->s_mount_opt, NOBH); - break; - case Opt_bh: - clear_opt(sbi->s_mount_opt, NOBH); - break; - case Opt_extents: - set_opt (sbi->s_mount_opt, EXTENTS); - break; - case Opt_noextents: - clear_opt (sbi->s_mount_opt, EXTENTS); - break; - case Opt_i_version: - set_opt(sbi->s_mount_opt, I_VERSION); - sb->s_flags |= MS_I_VERSION; - break; - case Opt_mballoc: - set_opt(sbi->s_mount_opt, MBALLOC); - break; - case Opt_nomballoc: - clear_opt(sbi->s_mount_opt, MBALLOC); - break; - case 
Opt_stripe: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - sbi->s_stripe = option; - break; - default: - printk (KERN_ERR - "EXT4-fs: Unrecognized mount option \"%s\" " - "or missing value\n", p); - return 0; + } else { + clear_opt(sb, DATA_FLAGS); + sbi->s_mount_opt |= m->mount_opt; + } +#ifdef CONFIG_QUOTA + } else if (m->flags & MOPT_QFMT) { + if (sb_any_quota_loaded(sb) && + sbi->s_jquota_fmt != m->mount_opt) { + ext4_msg(sb, KERN_ERR, "Cannot change journaled " + "quota options when quota turned on"); + return -1; + } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_QUOTA)) { + ext4_msg(sb, KERN_ERR, + "Cannot set journaled quota options " + "when QUOTA feature is enabled"); + return -1; + } + sbi->s_jquota_fmt = m->mount_opt; +#endif + } else { + if (!args->from) + arg = 1; + if (m->flags & MOPT_CLEAR) + arg = !arg; + else if (unlikely(!(m->flags & MOPT_SET))) { + ext4_msg(sb, KERN_WARNING, + "buggy handling of option %s", opt); + WARN_ON(1); + return -1; } + if (arg != 0) + sbi->s_mount_opt |= m->mount_opt; + else + sbi->s_mount_opt &= ~m->mount_opt; + } + return 1; +} + +static int parse_options(char *options, struct super_block *sb, + unsigned long *journal_devnum, + unsigned int *journal_ioprio, + int is_remount) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + char *p; + substring_t args[MAX_OPT_ARGS]; + int token; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + if (!*p) + continue; + /* + * Initialize args struct so we know whether arg was + * found; some options take optional arguments. + */ + args[0].to = args[0].from = NULL; + token = match_token(p, tokens, args); + if (handle_mount_opt(sb, p, token, args, journal_devnum, + journal_ioprio, is_remount) < 0) + return 0; } #ifdef CONFIG_QUOTA + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && + (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) { + ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA " + "feature is enabled"); + return 0; + } if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { - if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) && - sbi->s_qf_names[USRQUOTA]) - clear_opt(sbi->s_mount_opt, USRQUOTA); - - if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) && - sbi->s_qf_names[GRPQUOTA]) - clear_opt(sbi->s_mount_opt, GRPQUOTA); - - if ((sbi->s_qf_names[USRQUOTA] && - (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) || - (sbi->s_qf_names[GRPQUOTA] && - (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) { - printk(KERN_ERR "EXT4-fs: old and new quota " - "format mixing.\n"); + if (test_opt(sb, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + clear_opt(sb, USRQUOTA); + + if (test_opt(sb, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + clear_opt(sb, GRPQUOTA); + + if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) { + ext4_msg(sb, KERN_ERR, "old and new quota " + "format mixing"); return 0; } if (!sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT4-fs: journalled quota format " - "not specified.\n"); + ext4_msg(sb, KERN_ERR, "journaled quota format " + "not specified"); return 0; } } else { if (sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT4-fs: journalled quota format " - "specified with no journalling " - "enabled.\n"); + ext4_msg(sb, KERN_ERR, "journaled quota format " + "specified with no journaling " + "enabled"); return 0; } } #endif + if (test_opt(sb, DIOREAD_NOLOCK)) { + int blocksize = + BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); + + if (blocksize < PAGE_CACHE_SIZE) { + ext4_msg(sb, KERN_ERR, "can't mount with " + 
"dioread_nolock if block size != PAGE_SIZE"); + return 0; + } + } return 1; } +static inline void ext4_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#if defined(CONFIG_QUOTA) + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); +#endif +} + +static const char *token2str(int token) +{ + const struct match_token *t; + + for (t = tokens; t->token != Opt_err; t++) + if (t->token == token && !strchr(t->pattern, '=')) + break; + return t->pattern; +} + +/* + * Show an option if + * - it's set to a non-default value OR + * - if the per-sb default is different from the global default + */ +static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, + int nodefs) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt; + const struct mount_opts *m; + char sep = nodefs ? '\n' : ','; + +#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) +#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) + + if (sbi->s_sb_block != 1) + SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); + + for (m = ext4_mount_opts; m->token != Opt_err; m++) { + int want_set = m->flags & MOPT_SET; + if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || + (m->flags & MOPT_CLEAR_ERR)) + continue; + if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) + continue; /* skip if same as the default */ + if ((want_set && + (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) || + (!want_set && (sbi->s_mount_opt & m->mount_opt))) + continue; /* select Opt_noFoo vs Opt_Foo */ + SEQ_OPTS_PRINT("%s", token2str(m->token)); + } + + if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) || + le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) + SEQ_OPTS_PRINT("resuid=%u", + from_kuid_munged(&init_user_ns, sbi->s_resuid)); + if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) || + le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) + SEQ_OPTS_PRINT("resgid=%u", + from_kgid_munged(&init_user_ns, sbi->s_resgid)); + def_errors = nodefs ? 
-1 : le16_to_cpu(es->s_errors); + if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) + SEQ_OPTS_PUTS("errors=remount-ro"); + if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) + SEQ_OPTS_PUTS("errors=continue"); + if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) + SEQ_OPTS_PUTS("errors=panic"); + if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) + SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); + if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) + SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); + if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) + SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); + if (sb->s_flags & MS_I_VERSION) + SEQ_OPTS_PUTS("i_version"); + if (nodefs || sbi->s_stripe) + SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); + if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + SEQ_OPTS_PUTS("data=journal"); + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + SEQ_OPTS_PUTS("data=ordered"); + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + SEQ_OPTS_PUTS("data=writeback"); + } + if (nodefs || + sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) + SEQ_OPTS_PRINT("inode_readahead_blks=%u", + sbi->s_inode_readahead_blks); + + if (nodefs || (test_opt(sb, INIT_INODE_TABLE) && + (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) + SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); + if (nodefs || sbi->s_max_dir_size_kb) + SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); + + ext4_show_quota_options(seq, sb); + return 0; +} + +static int ext4_show_options(struct seq_file *seq, struct dentry *root) +{ + return _ext4_show_options(seq, root->d_sb, 0); +} + +static int options_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + int rc; + + seq_puts(seq, (sb->s_flags & MS_RDONLY) ? 
"ro" : "rw"); + rc = _ext4_show_options(seq, sb, 1); + seq_puts(seq, "\n"); + return rc; +} + +static int options_open_fs(struct inode *inode, struct file *file) +{ + return single_open(file, options_seq_show, PDE_DATA(inode)); +} + +static const struct file_operations ext4_seq_options_fops = { + .owner = THIS_MODULE, + .open = options_open_fs, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int read_only) { @@ -1356,107 +1893,188 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int res = 0; if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { - printk (KERN_ERR "EXT4-fs warning: revision level too high, " - "forcing read-only mode\n"); + ext4_msg(sb, KERN_ERR, "revision level too high, " + "forcing read-only mode"); res = MS_RDONLY; } if (read_only) - return res; + goto done; if (!(sbi->s_mount_state & EXT4_VALID_FS)) - printk (KERN_WARNING "EXT4-fs warning: mounting unchecked fs, " - "running e2fsck is recommended\n"); - else if ((sbi->s_mount_state & EXT4_ERROR_FS)) - printk (KERN_WARNING - "EXT4-fs warning: mounting fs with errors, " - "running e2fsck is recommended\n"); - else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && + ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " + "running e2fsck is recommended"); + else if (sbi->s_mount_state & EXT4_ERROR_FS) + ext4_msg(sb, KERN_WARNING, + "warning: mounting fs with errors, " + "running e2fsck is recommended"); + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) - printk (KERN_WARNING - "EXT4-fs warning: maximal mount count reached, " - "running e2fsck is recommended\n"); + ext4_msg(sb, KERN_WARNING, + "warning: maximal mount count reached, " + "running e2fsck is recommended"); else if (le32_to_cpu(es->s_checkinterval) && (le32_to_cpu(es->s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= get_seconds())) - printk (KERN_WARNING - "EXT4-fs warning: checktime reached, " - "running e2fsck is recommended\n"); -#if 0 - /* @@@ We _will_ want to clear the valid bit if we find - * inconsistencies, to force a fsck at reboot. But for - * a plain journaled filesystem we can keep it set as - * valid forever! 
:) - */ - es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT4_VALID_FS); -#endif + ext4_msg(sb, KERN_WARNING, + "warning: checktime reached, " + "running e2fsck is recommended"); + if (!sbi->s_journal) + es->s_state &= cpu_to_le16(~EXT4_VALID_FS); if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); - es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1); + le16_add_cpu(&es->s_mnt_count, 1); es->s_mtime = cpu_to_le32(get_seconds()); ext4_update_dynamic_rev(sb); - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + if (sbi->s_journal) + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); +done: if (test_opt(sb, DEBUG)) - printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, " - "bpg=%lu, ipg=%lu, mo=%04lx]\n", + printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " + "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", sb->s_blocksize, sbi->s_groups_count, EXT4_BLOCKS_PER_GROUP(sb), EXT4_INODES_PER_GROUP(sb), - sbi->s_mount_opt); + sbi->s_mount_opt, sbi->s_mount_opt2); + + cleancache_init_fs(sb); + return res; +} - printk(KERN_INFO "EXT4 FS on %s, ", sb->s_id); - if (EXT4_SB(sb)->s_journal->j_inode == NULL) { - char b[BDEVNAME_SIZE]; +int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct flex_groups *new_groups; + int size; - printk("external journal on %s\n", - bdevname(EXT4_SB(sb)->s_journal->j_dev, b)); - } else { - printk("internal journal\n"); + if (!sbi->s_log_groups_per_flex) + return 0; + + size = ext4_flex_group(sbi, ngroup - 1) + 1; + if (size <= sbi->s_flex_groups_allocated) + return 0; + + size = roundup_pow_of_two(size * sizeof(struct flex_groups)); + new_groups = ext4_kvzalloc(size, GFP_KERNEL); + if (!new_groups) { + ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", + size / (int) sizeof(struct flex_groups)); + return -ENOMEM; } - return res; + + if (sbi->s_flex_groups) { + memcpy(new_groups, sbi->s_flex_groups, + (sbi->s_flex_groups_allocated * + sizeof(struct flex_groups))); + ext4_kvfree(sbi->s_flex_groups); + } + sbi->s_flex_groups = new_groups; + sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups); + return 0; } -__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, - struct ext4_group_desc *gdp) +static int ext4_fill_flex_info(struct super_block *sb) { - __u16 crc = 0; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; + ext4_group_t flex_group; + int i, err; + + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; + if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { + sbi->s_log_groups_per_flex = 0; + return 1; + } - if (sbi->s_es->s_feature_ro_compat & - cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { - int offset = offsetof(struct ext4_group_desc, bg_checksum); - __le32 le_group = cpu_to_le32(block_group); + err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); + if (err) + goto failed; - crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); - crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); - crc = crc16(crc, (__u8 *)gdp, offset); - offset += sizeof(gdp->bg_checksum); /* skip checksum */ - /* for checksum of struct ext4_group_desc do the rest...*/ - if ((sbi->s_es->s_feature_incompat & - cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) && - offset < le16_to_cpu(sbi->s_es->s_desc_size)) - crc = crc16(crc, (__u8 *)gdp + offset, - le16_to_cpu(sbi->s_es->s_desc_size) 
- - offset); + for (i = 0; i < sbi->s_groups_count; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + + flex_group = ext4_flex_group(sbi, i); + atomic_add(ext4_free_inodes_count(sb, gdp), + &sbi->s_flex_groups[flex_group].free_inodes); + atomic64_add(ext4_free_group_clusters(sb, gdp), + &sbi->s_flex_groups[flex_group].free_clusters); + atomic_add(ext4_used_dirs_count(sb, gdp), + &sbi->s_flex_groups[flex_group].used_dirs); } + return 1; +failed: + return 0; +} + +static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, + struct ext4_group_desc *gdp) +{ + int offset; + __u16 crc = 0; + __le32 le_group = cpu_to_le32(block_group); + + if ((sbi->s_es->s_feature_ro_compat & + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { + /* Use new metadata_csum algorithm */ + __le16 save_csum; + __u32 csum32; + + save_csum = gdp->bg_checksum; + gdp->bg_checksum = 0; + csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, + sizeof(le_group)); + csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, + sbi->s_desc_size); + gdp->bg_checksum = save_csum; + + crc = csum32 & 0xFFFF; + goto out; + } + + /* old crc16 code */ + offset = offsetof(struct ext4_group_desc, bg_checksum); + + crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); + crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); + crc = crc16(crc, (__u8 *)gdp, offset); + offset += sizeof(gdp->bg_checksum); /* skip checksum */ + /* for checksum of struct ext4_group_desc do the rest...*/ + if ((sbi->s_es->s_feature_incompat & + cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) && + offset < le16_to_cpu(sbi->s_es->s_desc_size)) + crc = crc16(crc, (__u8 *)gdp + offset, + le16_to_cpu(sbi->s_es->s_desc_size) - + offset); + +out: return cpu_to_le16(crc); } -int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group, +int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { - if ((sbi->s_es->s_feature_ro_compat & - cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) && - (gdp->bg_checksum != ext4_group_desc_csum(sbi, block_group, gdp))) + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb), + block_group, gdp))) return 0; return 1; } +void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, + struct ext4_group_desc *gdp) +{ + if (!ext4_has_group_desc_csum(sb)) + return; + gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp); +} + /* Called at mount-time, super-block is locked */ -static int ext4_check_descriptors(struct super_block *sb) +static int ext4_check_descriptors(struct super_block *sb, + ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); @@ -1465,12 +2083,12 @@ static int ext4_check_descriptors(struct super_block *sb) ext4_fsblk_t inode_bitmap; ext4_fsblk_t inode_table; int flexbg_flag = 0; - ext4_group_t i; + ext4_group_t i, grp = sbi->s_groups_count; if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) flexbg_flag = 1; - ext4_debug ("Checking group descriptors"); + ext4_debug("Checking group descriptors"); for (i = 0; i < sbi->s_groups_count; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); @@ -1481,47 +2099,53 @@ static int ext4_check_descriptors(struct super_block *sb) last_block = first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1); + if ((grp == sbi->s_groups_count) && + !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + grp = i; + block_bitmap = 
ext4_block_bitmap(sb, gdp); - if (block_bitmap < first_block || block_bitmap > last_block) - { - ext4_error (sb, "ext4_check_descriptors", - "Block bitmap for group %lu" - " not in group (block %llu)!", - i, block_bitmap); + if (block_bitmap < first_block || block_bitmap > last_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Block bitmap for group %u not in group " + "(block %llu)!", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); - if (inode_bitmap < first_block || inode_bitmap > last_block) - { - ext4_error (sb, "ext4_check_descriptors", - "Inode bitmap for group %lu" - " not in group (block %llu)!", - i, inode_bitmap); + if (inode_bitmap < first_block || inode_bitmap > last_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode bitmap for group %u not in group " + "(block %llu)!", i, inode_bitmap); return 0; } inode_table = ext4_inode_table(sb, gdp); if (inode_table < first_block || - inode_table + sbi->s_itb_per_group - 1 > last_block) - { - ext4_error (sb, "ext4_check_descriptors", - "Inode table for group %lu" - " not in group (block %llu)!", - i, inode_table); + inode_table + sbi->s_itb_per_group - 1 > last_block) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Inode table for group %u not in group " + "(block %llu)!", i, inode_table); return 0; } - if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { - ext4_error(sb, __FUNCTION__, - "Checksum for group %lu failed (%u!=%u)\n", - i, le16_to_cpu(ext4_group_desc_csum(sbi, i, - gdp)), le16_to_cpu(gdp->bg_checksum)); - return 0; + ext4_lock_group(sb, i); + if (!ext4_group_desc_csum_verify(sb, i, gdp)) { + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Checksum for group %u failed (%u!=%u)", + i, le16_to_cpu(ext4_group_desc_csum(sbi, i, + gdp)), le16_to_cpu(gdp->bg_checksum)); + if (!(sb->s_flags & MS_RDONLY)) { + ext4_unlock_group(sb, i); + return 0; + } } + ext4_unlock_group(sb, i); if (!flexbg_flag) first_block += EXT4_BLOCKS_PER_GROUP(sb); } + if (NULL != first_not_zeroed) + *first_not_zeroed = grp; - ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); - sbi->s_es->s_free_inodes_count=cpu_to_le32(ext4_count_free_inodes(sb)); + ext4_free_blocks_count_set(sbi->s_es, + EXT4_C2B(sbi, ext4_count_free_clusters(sb))); + sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); return 1; } @@ -1542,8 +2166,8 @@ static int ext4_check_descriptors(struct super_block *sb) * e2fsck was run on this filesystem, and it must have already done the orphan * inode cleanup for us, so we can safely abort without any further action. 
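 */

Before the function itself, a minimal standalone sketch (invented names, not kernel code) may help picture what the cleanup below consumes: on disk the orphan list is a singly linked chain whose head inode number lives in es->s_last_orphan, and each orphan inode's i_dtime field holds the number of the next orphan. Processing an inode pops it off the head of the chain.

	#include <stdio.h>

	/* Toy model: next[ino] plays the role of the on-disk i_dtime link. */
	struct toy_fs {
		unsigned int last_orphan;
		unsigned int next[32];
	};

	static void toy_orphan_cleanup(struct toy_fs *fs)
	{
		while (fs->last_orphan) {
			unsigned int ino = fs->last_orphan;

			printf("processing orphan inode %u\n", ino);
			fs->last_orphan = fs->next[ino];	/* advance the chain */
			fs->next[ino] = 0;			/* inode is clean now */
		}
	}

	int main(void)
	{
		struct toy_fs fs = { .last_orphan = 14 };

		fs.next[14] = 9;	/* chain: 14 -> 9 -> 31 -> end */
		fs.next[9] = 31;
		toy_orphan_cleanup(&fs);
		return 0;
	}

The real function below additionally handles quota setup, truncation of partially written inodes, and read-only mounts.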
-static void ext4_orphan_cleanup (struct super_block * sb, - struct ext4_super_block * es) +static void ext4_orphan_cleanup(struct super_block *sb, + struct ext4_super_block *es) { unsigned int s_flags = sb->s_flags; int nr_orphans = 0, nr_truncates = 0; @@ -1556,23 +2180,31 @@ static void ext4_orphan_cleanup (struct super_block * sb, } if (bdev_read_only(sb->s_bdev)) { - printk(KERN_ERR "EXT4-fs: write access " - "unavailable, skipping orphan cleanup.\n"); + ext4_msg(sb, KERN_ERR, "write access " + "unavailable, skipping orphan cleanup"); + return; + } + + /* Check if feature set would not allow a r/w mount */ + if (!ext4_feature_set_ok(sb, 0)) { + ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " + "unknown ROCOMPAT features"); return; } if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { - if (es->s_last_orphan) + /* don't clear list on RO mount w/ errors */ + if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { jbd_debug(1, "Errors on filesystem, " "clearing orphan list.\n"); - es->s_last_orphan = 0; + es->s_last_orphan = 0; + } jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); return; } if (s_flags & MS_RDONLY) { - printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n", - sb->s_id); + ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); sb->s_flags &= ~MS_RDONLY; } #ifdef CONFIG_QUOTA @@ -1583,9 +2215,9 @@ static void ext4_orphan_cleanup (struct super_block * sb, if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); if (ret < 0) - printk(KERN_ERR - "EXT4-fs: Cannot turn on journalled " - "quota: error %d\n", ret); + ext4_msg(sb, KERN_ERR, + "Cannot turn on journaled " + "quota: error %d", ret); } } #endif @@ -1593,26 +2225,31 @@ static void ext4_orphan_cleanup (struct super_block * sb, while (es->s_last_orphan) { struct inode *inode; - if (!(inode = - ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) { + inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); + if (IS_ERR(inode)) { es->s_last_orphan = 0; break; } list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); - DQUOT_INIT(inode); + dquot_initialize(inode); if (inode->i_nlink) { - printk(KERN_DEBUG - "%s: truncating inode %lu to %Ld bytes\n", - __FUNCTION__, inode->i_ino, inode->i_size); - jbd_debug(2, "truncating inode %lu to %Ld bytes\n", + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "%s: truncating inode %lu to %lld bytes", + __func__, inode->i_ino, inode->i_size); + jbd_debug(2, "truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); + mutex_lock(&inode->i_mutex); + truncate_inode_pages(inode->i_mapping, inode->i_size); ext4_truncate(inode); + mutex_unlock(&inode->i_mutex); nr_truncates++; } else { - printk(KERN_DEBUG - "%s: deleting unreferenced inode %lu\n", - __FUNCTION__, inode->i_ino); + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "%s: deleting unreferenced inode %lu", + __func__, inode->i_ino); jbd_debug(2, "deleting unreferenced inode %lu\n", inode->i_ino); nr_orphans++; @@ -1620,23 +2257,24 @@ static void ext4_orphan_cleanup (struct super_block * sb, iput(inode); /* The delete magic happens here! */ } -#define PLURAL(x) (x), ((x)==1) ? "" : "s" +#define PLURAL(x) (x), ((x) == 1) ? 
"" : "s" if (nr_orphans) - printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n", - sb->s_id, PLURAL(nr_orphans)); + ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", + PLURAL(nr_orphans)); if (nr_truncates) - printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n", - sb->s_id, PLURAL(nr_truncates)); + ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", + PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA /* Turn quotas off */ for (i = 0; i < MAXQUOTAS; i++) { if (sb_dqopt(sb)->files[i]) - vfs_quota_off(sb, i); + dquot_quota_off(sb, i); } #endif sb->s_flags = s_flags; /* Restore MS_RDONLY status */ } + /* * Maximal extent format file size. * Resulting logical blkno at s_maxbytes must fit in our on-disk @@ -1644,17 +2282,23 @@ static void ext4_orphan_cleanup (struct super_block * sb, * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, * so that won't be a limiting factor. * + * However there is other limiting factor. We do store extents in the form + * of starting block and length, hence the resulting length of the extent + * covering maximum file size must fit into on-disk format containers as + * well. Given that length is always by 1 unit bigger than max unit (because + * we count 0 as well) we have to lower the s_maxbytes by one fs block. + * * Note, this does *not* consider any metadata overhead for vfs i_blocks. */ -static loff_t ext4_max_size(int blkbits) +static loff_t ext4_max_size(int blkbits, int has_huge_files) { loff_t res; loff_t upper_limit = MAX_LFS_FILESIZE; /* small i_blocks in vfs inode? */ - if (sizeof(blkcnt_t) < sizeof(u64)) { + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* - * CONFIG_LSF is not enabled implies the inode + * CONFIG_LBDAF is not enabled implies the inode * i_block represent total blocks in 512 bytes * 32 == size of vfs inode i_blocks * 8 */ @@ -1665,10 +2309,13 @@ static loff_t ext4_max_size(int blkbits) upper_limit <<= blkbits; } - /* 32-bit extent-start container, ee_block */ - res = 1LL << 32; + /* + * 32-bit extent-start container, ee_block. We lower the maxbytes + * by one fs block, so ee_len can cover the extent of maximum file + * size + */ + res = (1LL << 32) - 1; res <<= blkbits; - res -= 1; /* Sanity check against vm- & vfs- imposed limits */ if (res > upper_limit) @@ -1682,24 +2329,24 @@ static loff_t ext4_max_size(int blkbits) * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. * We need to be 1 filesystem block less than the 2^48 sector limit. */ -static loff_t ext4_max_bitmap_size(int bits) +static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) { loff_t res = EXT4_NDIR_BLOCKS; int meta_blocks; loff_t upper_limit; - /* This is calculated to be the largest file size for a - * dense, bitmapped file such that the total number of - * sectors in the file, including data and all indirect blocks, - * does not exceed 2^48 -1 - * __u32 i_blocks_lo and _u16 i_blocks_high representing the - * total number of 512 bytes blocks of the file + /* This is calculated to be the largest file size for a dense, block + * mapped file such that the file's total number of 512-byte sectors, + * including data and all indirect blocks, does not exceed (2^48 - 1). + * + * __u32 i_blocks_lo and _u16 i_blocks_high represent the total + * number of 512-byte sectors of the file. 
*/ - if (sizeof(blkcnt_t) < sizeof(u64)) { + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* - * CONFIG_LSF is not enabled implies the inode - * i_block represent total blocks in 512 bytes - * 32 == size of vfs inode i_blocks * 8 + * !has_huge_files or CONFIG_LBDAF not enabled implies that + * the inode i_block field represents total file blocks in + * 2^32 512-byte sectors == size of vfs inode i_blocks * 8 */ upper_limit = (1LL << 32) - 1; @@ -1741,7 +2388,7 @@ static loff_t ext4_max_bitmap_size(int bits) } static ext4_fsblk_t descriptor_loc(struct super_block *sb, - ext4_fsblk_t logical_sb_block, int nr) + ext4_fsblk_t logical_sb_block, int nr) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t bg, first_meta_bg; @@ -1755,6 +2402,17 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb, bg = sbi->s_desc_per_block * nr; if (ext4_bg_has_super(sb, bg)) has_super = 1; + + /* + * If we have a meta_bg fs with 1k blocks, group 0's GDT is at + * block 2, not 1. If s_first_data_block == 0 (bigalloc is enabled + * on modern mke2fs or blksize > 1k on older mke2fs) then we must + * compensate. + */ + if (sb->s_blocksize == 1024 && nr == 0 && + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) == 0) + has_super++; + return (has_super + ext4_group_first_block_no(sb, bg)); } @@ -1774,63 +2432,999 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); unsigned long stripe_width = le32_to_cpu(sbi->s_es->s_raid_stripe_width); + int ret; if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) - return sbi->s_stripe; + ret = sbi->s_stripe; + else if (stripe_width <= sbi->s_blocks_per_group) + ret = stripe_width; + else if (stride <= sbi->s_blocks_per_group) + ret = stride; + else + ret = 0; + + /* + * If the stripe width is 1, this makes no sense and + * we set it to 0 to turn off stripe handling code. 
+ */ + if (ret <= 1) + ret = 0; + + return ret; +} + +/* sysfs support */ + +struct ext4_attr { + struct attribute attr; + ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); + ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, + const char *, size_t); + union { + int offset; + int deprecated_val; + } u; +}; + +static int parse_strtoull(const char *buf, + unsigned long long max, unsigned long long *value) +{ + int ret; + + ret = kstrtoull(skip_spaces(buf), 0, value); + if (!ret && *value > max) + ret = -EINVAL; + return ret; +} + +static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (s64) EXT4_C2B(sbi, + percpu_counter_sum(&sbi->s_dirtyclusters_counter))); +} + +static ssize_t session_write_kbytes_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + return snprintf(buf, PAGE_SIZE, "%lu\n", + (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + sbi->s_sectors_written_start) >> 1); +} + +static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->s_kbytes_written + + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1))); +} + +static ssize_t inode_readahead_blks_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long t; + int ret; + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + + if (t && (!is_power_of_2(t) || t > 0x40000000)) + return -EINVAL; + + sbi->s_inode_readahead_blks = t; + return count; +} + +static ssize_t sbi_ui_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t sbi_ui_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); + unsigned long t; + int ret; + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + *ui = t; + return count; +} + +static ssize_t reserved_clusters_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long) atomic64_read(&sbi->s_resv_clusters)); +} + +static ssize_t reserved_clusters_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long long val; + int ret; + + if (parse_strtoull(buf, -1ULL, &val)) + return -EINVAL; + ret = ext4_reserve_clusters(sbi, val); + + return ret ? 
ret : count; +} + +static ssize_t trigger_test_error(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + int len = count; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (len && buf[len-1] == '\n') + len--; + + if (len) + ext4_error(sbi->s_sb, "%.*s", len, buf); + return count; +} + +static ssize_t sbi_deprecated_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val); +} + +#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .u = { \ + .offset = offsetof(struct ext4_sb_info, _elname),\ + }, \ +} +#define EXT4_ATTR(name, mode, show, store) \ +static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) + +#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL) +#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) +#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) +#define EXT4_RW_ATTR_SBI_UI(name, elname) \ + EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) +#define ATTR_LIST(name) &ext4_attr_##name.attr +#define EXT4_DEPRECATED_ATTR(_name, _val) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = sbi_deprecated_show, \ + .u = { \ + .deprecated_val = _val, \ + }, \ +} + +EXT4_RO_ATTR(delayed_allocation_blocks); +EXT4_RO_ATTR(session_write_kbytes); +EXT4_RO_ATTR(lifetime_write_kbytes); +EXT4_RW_ATTR(reserved_clusters); +EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, + inode_readahead_blks_store, s_inode_readahead_blks); +EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); +EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); +EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); +EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); +EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); +EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128); +EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); +EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); +EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); + +static struct attribute *ext4_attrs[] = { + ATTR_LIST(delayed_allocation_blocks), + ATTR_LIST(session_write_kbytes), + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(reserved_clusters), + ATTR_LIST(inode_readahead_blks), + ATTR_LIST(inode_goal), + ATTR_LIST(mb_stats), + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), + ATTR_LIST(mb_stream_req), + ATTR_LIST(mb_group_prealloc), + ATTR_LIST(max_writeback_mb_bump), + ATTR_LIST(extent_max_zeroout_kb), + ATTR_LIST(trigger_fs_error), + ATTR_LIST(err_ratelimit_interval_ms), + ATTR_LIST(err_ratelimit_burst), + ATTR_LIST(warning_ratelimit_interval_ms), + ATTR_LIST(warning_ratelimit_burst), + ATTR_LIST(msg_ratelimit_interval_ms), + 
ATTR_LIST(msg_ratelimit_burst), + NULL, +}; + +/* Features this copy of ext4 supports */ +EXT4_INFO_ATTR(lazy_itable_init); +EXT4_INFO_ATTR(batched_discard); +EXT4_INFO_ATTR(meta_bg_resize); + +static struct attribute *ext4_feat_attrs[] = { + ATTR_LIST(lazy_itable_init), + ATTR_LIST(batched_discard), + ATTR_LIST(meta_bg_resize), + NULL, +}; + +static ssize_t ext4_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t ext4_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void ext4_sb_release(struct kobject *kobj) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +static const struct sysfs_ops ext4_attr_ops = { + .show = ext4_attr_show, + .store = ext4_attr_store, +}; + +static struct kobj_type ext4_ktype = { + .default_attrs = ext4_attrs, + .sysfs_ops = &ext4_attr_ops, + .release = ext4_sb_release, +}; + +static void ext4_feat_release(struct kobject *kobj) +{ + complete(&ext4_feat->f_kobj_unregister); +} + +static struct kobj_type ext4_feat_ktype = { + .default_attrs = ext4_feat_attrs, + .sysfs_ops = &ext4_attr_ops, + .release = ext4_feat_release, +}; + +/* + * Check whether this filesystem can be mounted based on + * the features present and the RDONLY/RDWR mount requested. + * Returns 1 if this filesystem can be mounted as requested, + * 0 if it cannot be. 
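+ */

As a companion to the function below, here is a hypothetical, self-contained illustration of the mask test it is built around: any incompat bit outside the supported set forbids mounting entirely, while unknown ro_compat bits only forbid a read-write mount. The TOY_* constants and helper are invented for this sketch; the real supported sets are EXT4_FEATURE_INCOMPAT_SUPP and EXT4_FEATURE_RO_COMPAT_SUPP.

	#include <stdint.h>
	#include <stdio.h>

	#define TOY_INCOMPAT_SUPP	0x0003u	/* pretend bits 0-1 are understood */
	#define TOY_RO_COMPAT_SUPP	0x0001u	/* pretend bit 0 is understood */

	static int toy_feature_set_ok(uint32_t incompat, uint32_t ro_compat,
				      int readonly)
	{
		if (incompat & ~TOY_INCOMPAT_SUPP)
			return 0;	/* unknown incompat bit: refuse any mount */
		if (!readonly && (ro_compat & ~TOY_RO_COMPAT_SUPP))
			return 0;	/* unknown ro_compat bit: read-only only */
		return 1;
	}

	int main(void)
	{
		/* ro_compat bit 0x4 is unknown: r/w refused, read-only allowed */
		printf("rw=%d ro=%d\n",
		       toy_feature_set_ok(0, 0x4, 0),
		       toy_feature_set_ok(0, 0x4, 1));
		return 0;
	}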
+static int ext4_feature_set_ok(struct super_block *sb, int readonly) +{ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) { + ext4_msg(sb, KERN_ERR, + "Couldn't mount because of " + "unsupported optional features (%x)", + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & + ~EXT4_FEATURE_INCOMPAT_SUPP)); + return 0; + } + + if (readonly) + return 1; + + /* Check that feature set is OK for a read-write mount */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) { + ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " + "unsupported optional features (%x)", + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & + ~EXT4_FEATURE_RO_COMPAT_SUPP)); + return 0; + } + /* + * Large file size enabled file system can only be mounted + * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF + */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + if (sizeof(blkcnt_t) < sizeof(u64)) { + ext4_msg(sb, KERN_ERR, "Filesystem with huge files " + "cannot be mounted RDWR without " + "CONFIG_LBDAF"); + return 0; + } + } + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) && + !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { + ext4_msg(sb, KERN_ERR, + "Can't support bigalloc feature without " + "extents feature"); + return 0; + } + +#ifndef CONFIG_QUOTA + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && + !readonly) { + ext4_msg(sb, KERN_ERR, + "Filesystem with quota feature cannot be mounted RDWR " + "without CONFIG_QUOTA"); + return 0; + } +#endif /* CONFIG_QUOTA */ + return 1; +} + +/* + * This function is called once a day if we have errors logged + * on the file system + */ +static void print_daily_error_info(unsigned long arg) +{ + struct super_block *sb = (struct super_block *) arg; + struct ext4_sb_info *sbi; + struct ext4_super_block *es; + + sbi = EXT4_SB(sb); + es = sbi->s_es; + + if (es->s_error_count) + /* fsck newer than v1.41.13 is needed to clean this condition. 
*/ + ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", + le32_to_cpu(es->s_error_count)); + if (es->s_first_error_time) { + printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d", + sb->s_id, le32_to_cpu(es->s_first_error_time), + (int) sizeof(es->s_first_error_func), + es->s_first_error_func, + le32_to_cpu(es->s_first_error_line)); + if (es->s_first_error_ino) + printk(": inode %u", + le32_to_cpu(es->s_first_error_ino)); + if (es->s_first_error_block) + printk(": block %llu", (unsigned long long) + le64_to_cpu(es->s_first_error_block)); + printk("\n"); + } + if (es->s_last_error_time) { + printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d", + sb->s_id, le32_to_cpu(es->s_last_error_time), + (int) sizeof(es->s_last_error_func), + es->s_last_error_func, + le32_to_cpu(es->s_last_error_line)); + if (es->s_last_error_ino) + printk(": inode %u", + le32_to_cpu(es->s_last_error_ino)); + if (es->s_last_error_block) + printk(": block %llu", (unsigned long long) + le64_to_cpu(es->s_last_error_block)); + printk("\n"); + } + mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ +} + +/* Find next suitable group and run ext4_init_inode_table */ +static int ext4_run_li_request(struct ext4_li_request *elr) +{ + struct ext4_group_desc *gdp = NULL; + ext4_group_t group, ngroups; + struct super_block *sb; + unsigned long timeout = 0; + int ret = 0; + + sb = elr->lr_super; + ngroups = EXT4_SB(sb)->s_groups_count; + + sb_start_write(sb); + for (group = elr->lr_next_group; group < ngroups; group++) { + gdp = ext4_get_group_desc(sb, group, NULL); + if (!gdp) { + ret = 1; + break; + } + + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + break; + } + + if (group >= ngroups) + ret = 1; + + if (!ret) { + timeout = jiffies; + ret = ext4_init_inode_table(sb, group, + elr->lr_timeout ? 0 : 1); + if (elr->lr_timeout == 0) { + timeout = (jiffies - timeout) * + elr->lr_sbi->s_li_wait_mult; + elr->lr_timeout = timeout; + } + elr->lr_next_sched = jiffies + elr->lr_timeout; + elr->lr_next_group = group + 1; + } + sb_end_write(sb); + + return ret; +} + +/* + * Remove lr_request from the request list and free the + * request structure. Should be called with li_list_mtx held + */ +static void ext4_remove_li_request(struct ext4_li_request *elr) +{ + struct ext4_sb_info *sbi; + + if (!elr) + return; + + sbi = elr->lr_sbi; + + list_del(&elr->lr_request); + sbi->s_li_request = NULL; + kfree(elr); +} + +static void ext4_unregister_li_request(struct super_block *sb) +{ + mutex_lock(&ext4_li_mtx); + if (!ext4_li_info) { + mutex_unlock(&ext4_li_mtx); + return; + } + + mutex_lock(&ext4_li_info->li_list_mtx); + ext4_remove_li_request(EXT4_SB(sb)->s_li_request); + mutex_unlock(&ext4_li_info->li_list_mtx); + mutex_unlock(&ext4_li_mtx); +} + +static struct task_struct *ext4_lazyinit_task; + +/* + * This is the function where the ext4lazyinit thread lives. It walks + * through the request list searching for the next scheduled filesystem. + * When such a fs is found, it runs the lazy initialization request + * (ext4_run_li_request) and keeps track of the time spent in this + * function. Based on that time we compute the next schedule time of + * the request. When the walk through the list is complete, it computes + * the next wakeup time and puts itself to sleep. 
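+ */

The scheduling policy described above can be modelled outside the kernel in a few lines: run every request whose deadline has passed, then sleep until the earliest remaining deadline. The sketch below is an invented userspace model of just that computation; the real thread additionally handles freezing, kthread_should_stop() and list locking.

	#include <stdio.h>

	#define TOY_NO_DEADLINE (~0ul)

	struct toy_req { unsigned long next_sched; };

	/* Run due requests; return next wakeup time (TOY_NO_DEADLINE if idle). */
	static unsigned long toy_poll(struct toy_req *r, int n, unsigned long now)
	{
		unsigned long wake = TOY_NO_DEADLINE;

		for (int i = 0; i < n; i++) {
			if (now >= r[i].next_sched) {
				printf("t=%lu: run request %d\n", now, i);
				r[i].next_sched = now + 10;	/* reschedule */
			}
			if (r[i].next_sched < wake)
				wake = r[i].next_sched;
		}
		return wake;	/* caller sleeps until this time */
	}

	int main(void)
	{
		struct toy_req reqs[2] = { { 5 }, { 7 } };
		unsigned long now = 0;

		for (int step = 0; step < 3; step++)
			now = toy_poll(reqs, 2, now);	/* "sleep" == jump ahead */
		return 0;
	}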
+static int ext4_lazyinit_thread(void *arg) +{ + struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; + struct list_head *pos, *n; + struct ext4_li_request *elr; + unsigned long next_wakeup, cur; + + BUG_ON(NULL == eli); + +cont_thread: + while (true) { + next_wakeup = MAX_JIFFY_OFFSET; + + mutex_lock(&eli->li_list_mtx); + if (list_empty(&eli->li_request_list)) { + mutex_unlock(&eli->li_list_mtx); + goto exit_thread; + } + + list_for_each_safe(pos, n, &eli->li_request_list) { + elr = list_entry(pos, struct ext4_li_request, + lr_request); + + if (time_after_eq(jiffies, elr->lr_next_sched)) { + if (ext4_run_li_request(elr) != 0) { + /* error, remove the lazy_init job */ + ext4_remove_li_request(elr); + continue; + } + } + + if (time_before(elr->lr_next_sched, next_wakeup)) + next_wakeup = elr->lr_next_sched; + } + mutex_unlock(&eli->li_list_mtx); + + try_to_freeze(); + + cur = jiffies; + if ((time_after_eq(cur, next_wakeup)) || + (MAX_JIFFY_OFFSET == next_wakeup)) { + cond_resched(); + continue; + } + + schedule_timeout_interruptible(next_wakeup - cur); + + if (kthread_should_stop()) { + ext4_clear_request_list(); + goto exit_thread; + } + } + +exit_thread: + /* + * It looks like the request list is empty, but we need + * to check it under the li_list_mtx lock, to prevent any + * additions into it, and of course we should lock ext4_li_mtx + * to atomically free the list and ext4_li_info, because at + * this point another ext4 filesystem could be registering + * a new one. + */ + mutex_lock(&ext4_li_mtx); + mutex_lock(&eli->li_list_mtx); + if (!list_empty(&eli->li_request_list)) { + mutex_unlock(&eli->li_list_mtx); + mutex_unlock(&ext4_li_mtx); + goto cont_thread; + } + mutex_unlock(&eli->li_list_mtx); + kfree(ext4_li_info); + ext4_li_info = NULL; + mutex_unlock(&ext4_li_mtx); + + return 0; +} + +static void ext4_clear_request_list(void) +{ + struct list_head *pos, *n; + struct ext4_li_request *elr; + + mutex_lock(&ext4_li_info->li_list_mtx); + list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { + elr = list_entry(pos, struct ext4_li_request, + lr_request); + ext4_remove_li_request(elr); + } + mutex_unlock(&ext4_li_info->li_list_mtx); +} + +static int ext4_run_lazyinit_thread(void) +{ + ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread, + ext4_li_info, "ext4lazyinit"); + if (IS_ERR(ext4_lazyinit_task)) { + int err = PTR_ERR(ext4_lazyinit_task); + ext4_clear_request_list(); + kfree(ext4_li_info); + ext4_li_info = NULL; + printk(KERN_CRIT "EXT4-fs: error %d creating inode table " + "initialization thread\n", + err); + return err; + } + ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; + return 0; +} + +/* + * Check whether it makes sense to run the itable init thread or not. + * If there is at least one uninitialized inode table, return the + * corresponding group number; else the loop goes through all + * groups and returns the total number of groups. 
+ */ +static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) +{ + ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; + struct ext4_group_desc *gdp = NULL; + + for (group = 0; group < ngroups; group++) { + gdp = ext4_get_group_desc(sb, group, NULL); + if (!gdp) + continue; + + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) + break; + } + + return group; +} + +static int ext4_li_info_new(void) +{ + struct ext4_lazy_init *eli = NULL; + + eli = kzalloc(sizeof(*eli), GFP_KERNEL); + if (!eli) + return -ENOMEM; - if (stripe_width <= sbi->s_blocks_per_group) - return stripe_width; + INIT_LIST_HEAD(&eli->li_request_list); + mutex_init(&eli->li_list_mtx); - if (stride <= sbi->s_blocks_per_group) - return stride; + eli->li_state |= EXT4_LAZYINIT_QUIT; + + ext4_li_info = eli; return 0; } -static int ext4_fill_super (struct super_block *sb, void *data, int silent) - __releases(kernel_sem) - __acquires(kernel_sem) +static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, + ext4_group_t start) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_li_request *elr; + + elr = kzalloc(sizeof(*elr), GFP_KERNEL); + if (!elr) + return NULL; + + elr->lr_super = sb; + elr->lr_sbi = sbi; + elr->lr_next_group = start; + + /* + * Randomize first schedule time of the request to + * spread the inode table initialization requests + * better. + */ + elr->lr_next_sched = jiffies + (prandom_u32() % + (EXT4_DEF_LI_MAX_START_DELAY * HZ)); + return elr; +} + +int ext4_register_li_request(struct super_block *sb, + ext4_group_t first_not_zeroed) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_li_request *elr = NULL; + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + int ret = 0; + + mutex_lock(&ext4_li_mtx); + if (sbi->s_li_request != NULL) { + /* + * Reset timeout so it can be computed again, because + * s_li_wait_mult might have changed. + */ + sbi->s_li_request->lr_timeout = 0; + goto out; + } + + if (first_not_zeroed == ngroups || + (sb->s_flags & MS_RDONLY) || + !test_opt(sb, INIT_INODE_TABLE)) + goto out; + + elr = ext4_li_request_new(sb, first_not_zeroed); + if (!elr) { + ret = -ENOMEM; + goto out; + } + + if (NULL == ext4_li_info) { + ret = ext4_li_info_new(); + if (ret) + goto out; + } + + mutex_lock(&ext4_li_info->li_list_mtx); + list_add(&elr->lr_request, &ext4_li_info->li_request_list); + mutex_unlock(&ext4_li_info->li_list_mtx); + + sbi->s_li_request = elr; + /* + * set elr to NULL here since it has been inserted to + * the request_list and the removal and free of it is + * handled by ext4_clear_request_list from now on. + */ + elr = NULL; + + if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { + ret = ext4_run_lazyinit_thread(); + if (ret) + goto out; + } +out: + mutex_unlock(&ext4_li_mtx); + if (ret) + kfree(elr); + return ret; +} + +/* + * We do not need to lock anything since this is called on + * module unload. + */ +static void ext4_destroy_lazyinit_thread(void) +{ + /* + * If thread exited earlier + * there's nothing to be done. 
+ */ + if (!ext4_li_info || !ext4_lazyinit_task) + return; + + kthread_stop(ext4_lazyinit_task); +} + +static int set_journal_csum_feature_set(struct super_block *sb) +{ + int ret = 1; + int compat, incompat; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + /* journal checksum v2 */ + compat = 0; + incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2; + } else { + /* journal checksum v1 */ + compat = JBD2_FEATURE_COMPAT_CHECKSUM; + incompat = 0; + } + + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ret = jbd2_journal_set_features(sbi->s_journal, + compat, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | + incompat); + } else if (test_opt(sb, JOURNAL_CHECKSUM)) { + ret = jbd2_journal_set_features(sbi->s_journal, + compat, 0, + incompat); + jbd2_journal_clear_features(sbi->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + } else { + jbd2_journal_clear_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | + JBD2_FEATURE_INCOMPAT_CSUM_V2); + } + + return ret; +} + +/* + * Note: calculating the overhead so we can be compatible with + * historical BSD practice is quite difficult in the face of + * clusters/bigalloc. This is because multiple metadata blocks from + * different block groups can end up in the same allocation cluster. + * Calculating the exact overhead in the face of clustered allocation + * requires either O(all block bitmaps) in memory or O(number of block + * groups**2) in time. We will still calculate the overhead for + * older file systems --- and if we come across a bigalloc file + * system with zero in s_overhead_clusters the estimate will be close to + * correct, especially for very large cluster sizes --- but for newer + * file systems, it's better to calculate this figure once at mkfs + * time, and store it in the superblock. If the superblock value is + * present (even for non-bigalloc file systems), we will use it. 
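+ */

As a worked example of the non-bigalloc fast path the function below takes first: per-group overhead is one superblock backup (if the group has one), plus the group-descriptor blocks, the inode-table blocks, and the two bitmaps. The figures below are illustrative assumptions, not taken from any particular filesystem.

	#include <stdio.h>

	int main(void)
	{
		int has_super = 1;	/* assume this group holds a sb/GDT backup */
		int gdb_blocks = 1;	/* assume all descriptors fit in one block */
		int itb_per_group = 512; /* 8192 inodes * 256 B / 4096 B blocks */
		int bitmaps = 2;	/* one block bitmap + one inode bitmap */

		printf("per-group overhead = %d blocks\n",
		       has_super + gdb_blocks + itb_per_group + bitmaps);
		return 0;
	}

This mirrors the formula in the code below: ext4_bg_has_super() + ext4_bg_num_gdb() + s_itb_per_group + 2.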
+static int count_overhead(struct super_block *sb, ext4_group_t grp, + char *buf) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp; + ext4_fsblk_t first_block, last_block, b; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + int s, j, count = 0; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) + return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) + + sbi->s_itb_per_group + 2); + + first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + + (grp * EXT4_BLOCKS_PER_GROUP(sb)); + last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1; + for (i = 0; i < ngroups; i++) { + gdp = ext4_get_group_desc(sb, i, NULL); + b = ext4_block_bitmap(sb, gdp); + if (b >= first_block && b <= last_block) { + ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); + count++; + } + b = ext4_inode_bitmap(sb, gdp); + if (b >= first_block && b <= last_block) { + ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); + count++; + } + b = ext4_inode_table(sb, gdp); + if (b >= first_block && b + sbi->s_itb_per_group <= last_block) + for (j = 0; j < sbi->s_itb_per_group; j++, b++) { + int c = EXT4_B2C(sbi, b - first_block); + ext4_set_bit(c, buf); + count++; + } + if (i != grp) + continue; + s = 0; + if (ext4_bg_has_super(sb, grp)) { + ext4_set_bit(s++, buf); + count++; + } + for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) { + ext4_set_bit(EXT4_B2C(sbi, s++), buf); + count++; + } + } + if (!count) + return 0; + return EXT4_CLUSTERS_PER_GROUP(sb) - + ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8); +} +/* + * Compute the overhead and stash it in sbi->s_overhead + */ +int ext4_calculate_overhead(struct super_block *sb) { - struct buffer_head * bh; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + ext4_fsblk_t overhead = 0; + char *buf = (char *) get_zeroed_page(GFP_KERNEL); + + if (!buf) + return -ENOMEM; + + /* + * Compute the overhead (FS structures). This is constant + * for a given filesystem unless the number of block groups + * changes so we cache the previous value until it does. + */ + + /* + * All of the blocks before first_data_block are overhead + */ + overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); + + /* + * Add the overhead found in each block group + */ + for (i = 0; i < ngroups; i++) { + int blks; + + blks = count_overhead(sb, i, buf); + overhead += blks; + if (blks) + memset(buf, 0, PAGE_SIZE); + cond_resched(); + } + /* Add the journal blocks as well */ + if (sbi->s_journal) + overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); + + sbi->s_overhead = overhead; + smp_wmb(); + free_page((unsigned long) buf); + return 0; +} + + +static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb) +{ + ext4_fsblk_t resv_clusters; + + /* + * There's no need to reserve anything when we aren't using extents. + * The space estimates are exact, there are no unwritten extents, + * hole punching doesn't need new metadata... This is needed especially + * to keep ext2/3 backward compatibility. + */ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) + return 0; + /* + * By default we reserve 2% or 4096 clusters, whichever is smaller. + * This should cover the situations where we cannot afford to run + * out of space, like for example punch hole, or converting + * unwritten extents in the delalloc path. In most cases such an + * allocation would require 1 or 2 blocks; higher numbers are + * very rare. 
+ */ + resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >> + EXT4_SB(sb)->s_cluster_bits; + + do_div(resv_clusters, 50); + resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); + + return resv_clusters; +} + + +static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count) +{ + ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >> + sbi->s_cluster_bits; + + if (count >= clusters) + return -EINVAL; + + atomic64_set(&sbi->s_resv_clusters, count); + return 0; +} + +static int ext4_fill_super(struct super_block *sb, void *data, int silent) +{ + char *orig_data = kstrdup(data, GFP_KERNEL); + struct buffer_head *bh; struct ext4_super_block *es = NULL; struct ext4_sb_info *sbi; ext4_fsblk_t block; ext4_fsblk_t sb_block = get_sb_block(&data); ext4_fsblk_t logical_sb_block; unsigned long offset = 0; - unsigned int journal_inum = 0; unsigned long journal_devnum = 0; unsigned long def_mount_opts; struct inode *root; - int ret = -EINVAL; - int blocksize; - int db_count; - int i; - int needs_recovery; - __le32 features; + char *cp; + const char *descr; + int ret = -ENOMEM; + int blocksize, clustersize; + unsigned int db_count; + unsigned int i; + int needs_recovery, has_huge_files, has_bigalloc; __u64 blocks_count; - int err; + int err = 0; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ext4_group_t first_not_zeroed; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) - return -ENOMEM; + goto out_free_orig; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + goto out_free_orig; + } sb->s_fs_info = sbi; - sbi->s_mount_opt = 0; - sbi->s_resuid = EXT4_DEF_RESUID; - sbi->s_resgid = EXT4_DEF_RESGID; + sbi->s_sb = sb; + sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; + if (sb->s_bdev->bd_part) + sbi->s_sectors_written_start = + part_stat_read(sb->s_bdev->bd_part, sectors[1]); - unlock_kernel(); + /* Cleanup superblock name */ + for (cp = sb->s_id; (cp = strchr(cp, '/'));) + *cp = '!'; + /* -EINVAL is default */ + ret = -EINVAL; blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { - printk(KERN_ERR "EXT4-fs: unable to set blocksize\n"); - goto out_fail; - } - - if (!sb_set_blocksize(sb, blocksize)) { - printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize); + ext4_msg(sb, KERN_ERR, "unable to set blocksize"); goto out_fail; } @@ -1846,88 +3440,186 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) } if (!(bh = sb_bread(sb, logical_sb_block))) { - printk (KERN_ERR "EXT4-fs: unable to read superblock\n"); + ext4_msg(sb, KERN_ERR, "unable to read superblock"); goto out_fail; } /* * Note: s_es must be initialized as soon as possible because * some ext4 macro-instructions depend on its value */ - es = (struct ext4_super_block *) (((char *)bh->b_data) + offset); + es = (struct ext4_super_block *) (bh->b_data + offset); sbi->s_es = es; sb->s_magic = le16_to_cpu(es->s_magic); if (sb->s_magic != EXT4_SUPER_MAGIC) goto cantfind_ext4; + sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); + + /* Warn if metadata_csum and gdt_csum are both set. 
*/ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + ext4_warning(sb, KERN_INFO "metadata_csum and uninit_bg are " + "redundant flags; please run fsck."); + + /* Check for a known checksum algorithm */ + if (!ext4_verify_csum_type(sb, es)) { + ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " + "unknown checksum algorithm."); + silent = 1; + goto cantfind_ext4; + } + + /* Load the checksum driver */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(sbi->s_chksum_driver)) { + ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); + ret = PTR_ERR(sbi->s_chksum_driver); + sbi->s_chksum_driver = NULL; + goto failed_mount; + } + } + + /* Check superblock checksum */ + if (!ext4_superblock_csum_verify(sb, es)) { + ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " + "invalid superblock checksum. Run e2fsck?"); + silent = 1; + goto cantfind_ext4; + } + + /* Precompute checksum seed for all metadata */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, + sizeof(es->s_uuid)); /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + set_opt(sb, INIT_INODE_TABLE); if (def_mount_opts & EXT4_DEFM_DEBUG) - set_opt(sbi->s_mount_opt, DEBUG); + set_opt(sb, DEBUG); if (def_mount_opts & EXT4_DEFM_BSDGROUPS) - set_opt(sbi->s_mount_opt, GRPID); + set_opt(sb, GRPID); if (def_mount_opts & EXT4_DEFM_UID16) - set_opt(sbi->s_mount_opt, NO_UID32); -#ifdef CONFIG_EXT4DEV_FS_XATTR - if (def_mount_opts & EXT4_DEFM_XATTR_USER) - set_opt(sbi->s_mount_opt, XATTR_USER); -#endif -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL - if (def_mount_opts & EXT4_DEFM_ACL) - set_opt(sbi->s_mount_opt, POSIX_ACL); + set_opt(sb, NO_UID32); + /* xattr user namespace & acls are now defaulted on */ + set_opt(sb, XATTR_USER); +#ifdef CONFIG_EXT4_FS_POSIX_ACL + set_opt(sb, POSIX_ACL); #endif if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) - sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; + set_opt(sb, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) - sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA; + set_opt(sb, ORDERED_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) - sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA; + set_opt(sb, WRITEBACK_DATA); if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) - set_opt(sbi->s_mount_opt, ERRORS_PANIC); + set_opt(sb, ERRORS_PANIC); else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) - set_opt(sbi->s_mount_opt, ERRORS_CONT); + set_opt(sb, ERRORS_CONT); else - set_opt(sbi->s_mount_opt, ERRORS_RO); + set_opt(sb, ERRORS_RO); + if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) + set_opt(sb, BLOCK_VALIDITY); + if (def_mount_opts & EXT4_DEFM_DISCARD) + set_opt(sb, DISCARD); - sbi->s_resuid = le16_to_cpu(es->s_def_resuid); - sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); + sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); + sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; + sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; + sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; - set_opt(sbi->s_mount_opt, RESERVATION); + if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) + set_opt(sb, 
BARRIER); /* - * turn on extents feature by default in ext4 filesystem - * User -o noextents to turn it off + * enable delayed allocation by default + * Use -o nodelalloc to turn it off */ - set_opt(sbi->s_mount_opt, EXTENTS); + if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) && + ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) + set_opt(sb, DELALLOC); + /* - * turn on mballoc feature by default in ext4 filesystem - * User -o nomballoc to turn it off + * set default s_li_wait_mult for lazyinit, for the case there is + * no mount option specified. */ - set_opt(sbi->s_mount_opt, MBALLOC); + sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; - if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, - NULL, 0)) + if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, + &journal_devnum, &journal_ioprio, 0)) { + ext4_msg(sb, KERN_WARNING, + "failed to parse options in superblock: %s", + sbi->s_es->s_mount_opts); + } + sbi->s_def_mount_opt = sbi->s_mount_opt; + if (!parse_options((char *) data, sb, &journal_devnum, + &journal_ioprio, 0)) goto failed_mount; + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk_once(KERN_WARNING "EXT4-fs: Warning: mounting " + "with data=journal disables delayed " + "allocation and O_DIRECT support!\n"); + if (test_opt2(sb, EXPLICIT_DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + goto failed_mount; + } + if (test_opt(sb, DIOREAD_NOLOCK)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dioread_nolock"); + goto failed_mount; + } + if (test_opt(sb, DELALLOC)) + clear_opt(sb, DELALLOC); + } + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) || EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U))) - printk(KERN_WARNING - "EXT4-fs warning: feature flags set on rev 0 fs, " - "running e2fsck is recommended\n"); + ext4_msg(sb, KERN_WARNING, + "feature flags set on rev 0 fs, " + "running e2fsck is recommended"); + + if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { + set_opt2(sb, HURD_COMPAT); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_64BIT)) { + ext4_msg(sb, KERN_ERR, + "The Hurd can't support 64-bit file systems"); + goto failed_mount; + } + } - /* - * Since ext4 is still considered development code, we require - * that the TEST_FILESYS flag in s->flags be set. - */ - if (!(le32_to_cpu(es->s_flags) & EXT2_FLAGS_TEST_FILESYS)) { - printk(KERN_WARNING "EXT4-fs: %s: not marked " - "OK to use with test code.\n", sb->s_id); - goto failed_mount; + if (IS_EXT2_SB(sb)) { + if (ext2_feature_set_ok(sb)) + ext4_msg(sb, KERN_INFO, "mounting ext2 file system " + "using the ext4 subsystem"); + else { + ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " + "to feature incompatibilities"); + goto failed_mount; + } + } + + if (IS_EXT3_SB(sb)) { + if (ext3_feature_set_ok(sb)) + ext4_msg(sb, KERN_INFO, "mounting ext3 file system " + "using the ext4 subsystem"); + else { + ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " + "to feature incompatibilities"); + goto failed_mount; + } } /* @@ -1935,72 +3627,48 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) * previously didn't change the revision level when setting the flags, * so there is a chance incompat flags are set on a rev 0 filesystem. 
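
Note the three layers of option precedence established above: built-in defaults derived from s_default_mount_opts, then the option string stored in the superblock (s_mount_opts), then the string passed to mount(2), with s_def_mount_opt snapshotted in between so remount can distinguish user choices from defaults. A toy model of that layering (option names hypothetical):

#include <stdio.h>

#define OPT_DELALLOC	0x1
#define OPT_BARRIER	0x2
#define OPT_DISCARD	0x4

/* Each later layer may set or clear bits on top of the earlier ones,
 * mirroring defaults -> superblock opts -> user-supplied opts. */
static unsigned apply_layer(unsigned opts, unsigned set, unsigned clear)
{
	return (opts | set) & ~clear;
}

int main(void)
{
	unsigned opts = OPT_DELALLOC | OPT_BARRIER;	/* defaults */

	opts = apply_layer(opts, OPT_DISCARD, 0);	/* s_mount_opts */
	opts = apply_layer(opts, 0, OPT_DELALLOC);	/* user: nodelalloc */
	printf("final opts: %#x\n", opts);		/* 0x6 */
	return 0;
}
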
*/ - features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); - if (features) { - printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of " - "unsupported optional features (%x).\n", - sb->s_id, le32_to_cpu(features)); - goto failed_mount; - } - features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); - if (!(sb->s_flags & MS_RDONLY) && features) { - printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of " - "unsupported optional features (%x).\n", - sb->s_id, le32_to_cpu(features)); + if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) goto failed_mount; - } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { - /* - * Large file size enabled file system can only be - * mount if kernel is build with CONFIG_LSF - */ - if (sizeof(root->i_blocks) < sizeof(u64) && - !(sb->s_flags & MS_RDONLY)) { - printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge " - "files cannot be mounted read-write " - "without CONFIG_LSF.\n", sb->s_id); - goto failed_mount; - } - } - blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); if (blocksize < EXT4_MIN_BLOCK_SIZE || blocksize > EXT4_MAX_BLOCK_SIZE) { - printk(KERN_ERR - "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n", - blocksize, sb->s_id); + ext4_msg(sb, KERN_ERR, + "Unsupported filesystem blocksize %d", blocksize); goto failed_mount; } if (sb->s_blocksize != blocksize) { - /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { - printk(KERN_ERR "EXT4-fs: bad block size %d.\n", + ext4_msg(sb, KERN_ERR, "bad block size %d", blocksize); goto failed_mount; } - brelse (bh); + brelse(bh); logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); bh = sb_bread(sb, logical_sb_block); if (!bh) { - printk(KERN_ERR - "EXT4-fs: Can't read superblock on 2nd try.\n"); + ext4_msg(sb, KERN_ERR, + "Can't read superblock on 2nd try"); goto failed_mount; } - es = (struct ext4_super_block *)(((char *)bh->b_data) + offset); + es = (struct ext4_super_block *)(bh->b_data + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { - printk (KERN_ERR - "EXT4-fs: Magic mismatch, very weird !\n"); + ext4_msg(sb, KERN_ERR, + "Magic mismatch, very weird!"); goto failed_mount; } } - sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits); - sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits); + has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_HUGE_FILE); + sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, + has_huge_files); + sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; @@ -2011,30 +3679,33 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || (!is_power_of_2(sbi->s_inode_size)) || (sbi->s_inode_size > blocksize)) { - printk (KERN_ERR - "EXT4-fs: unsupported inode size: %d\n", - sbi->s_inode_size); + ext4_msg(sb, KERN_ERR, + "unsupported inode size: %d", + sbi->s_inode_size); goto failed_mount; } if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2); } + sbi->s_desc_size = le16_to_cpu(es->s_desc_size); if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) { if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || sbi->s_desc_size > EXT4_MAX_DESC_SIZE || 
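
When the chosen blocksize is larger than 1 KiB, the primary superblock at byte offset 1024 no longer begins a block of its own, so the code above recomputes which block holds it and the byte offset inside that block before re-reading it. The same arithmetic in user-space form:

#include <stdint.h>
#include <stdio.h>

#define EXT4_MIN_BLOCK_SIZE 1024

int main(void)
{
	uint64_t sb_block = 1;		/* default: superblock at 1 KiB */
	unsigned blocksize = 4096;

	uint64_t byte = sb_block * EXT4_MIN_BLOCK_SIZE;
	uint64_t logical_sb_block = byte / blocksize;	/* do_div() quotient */
	unsigned offset = byte % blocksize;		/* do_div() remainder */

	/* With 4 KiB blocks the superblock sits in block 0 at offset 1024. */
	printf("block %llu, offset %u\n",
	       (unsigned long long)logical_sb_block, offset);
	return 0;
}
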
!is_power_of_2(sbi->s_desc_size)) { - printk(KERN_ERR - "EXT4-fs: unsupported descriptor size %lu\n", + ext4_msg(sb, KERN_ERR, + "unsupported descriptor size %lu", sbi->s_desc_size); goto failed_mount; } } else sbi->s_desc_size = EXT4_MIN_DESC_SIZE; + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) goto cantfind_ext4; + sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0) goto cantfind_ext4; @@ -2045,82 +3716,194 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) sbi->s_mount_state = le16_to_cpu(es->s_state); sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); - for (i=0; i < 4; i++) + + for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + if (!(sb->s_flags & MS_RDONLY)) + es->s_flags |= + cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + if (!(sb->s_flags & MS_RDONLY)) + es->s_flags |= + cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + } + } - if (sbi->s_blocks_per_group > blocksize * 8) { - printk (KERN_ERR - "EXT4-fs: #blocks per group too big: %lu\n", - sbi->s_blocks_per_group); - goto failed_mount; + /* Handle clustersize */ + clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); + has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_BIGALLOC); + if (has_bigalloc) { + if (clustersize < blocksize) { + ext4_msg(sb, KERN_ERR, + "cluster size (%d) smaller than " + "block size (%d)", clustersize, blocksize); + goto failed_mount; + } + sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - + le32_to_cpu(es->s_log_block_size); + sbi->s_clusters_per_group = + le32_to_cpu(es->s_clusters_per_group); + if (sbi->s_clusters_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#clusters per group too big: %lu", + sbi->s_clusters_per_group); + goto failed_mount; + } + if (sbi->s_blocks_per_group != + (sbi->s_clusters_per_group * (clustersize / blocksize))) { + ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and " + "clusters per group (%lu) inconsistent", + sbi->s_blocks_per_group, + sbi->s_clusters_per_group); + goto failed_mount; + } + } else { + if (clustersize != blocksize) { + ext4_warning(sb, "fragment/cluster size (%d) != " + "block size (%d)", clustersize, + blocksize); + clustersize = blocksize; + } + if (sbi->s_blocks_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, + "#blocks per group too big: %lu", + sbi->s_blocks_per_group); + goto failed_mount; + } + sbi->s_clusters_per_group = sbi->s_blocks_per_group; + sbi->s_cluster_bits = 0; } + sbi->s_cluster_ratio = clustersize / blocksize; + if (sbi->s_inodes_per_group > blocksize * 8) { - printk (KERN_ERR - "EXT4-fs: #inodes per group too big: %lu\n", - sbi->s_inodes_per_group); + ext4_msg(sb, KERN_ERR, + "#inodes per group too big: %lu", + sbi->s_inodes_per_group); goto failed_mount; } - if (ext4_blocks_count(es) > - (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { - printk(KERN_ERR "EXT4-fs: filesystem on %s:" - " too large to mount safely\n", sb->s_id); + /* Do we have standard group size 
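
For bigalloc filesystems the cluster handling above boils down to shift arithmetic: s_cluster_bits is the log2 of the cluster-to-block ratio, s_cluster_ratio is the ratio itself, and blocks-per-group must equal clusters-per-group times that ratio. A worked numeric sketch:

#include <stdio.h>

int main(void)
{
	unsigned log_block_size = 2;	/* 4 KiB blocks:    1024 << 2 */
	unsigned log_cluster_size = 6;	/* 64 KiB clusters: 1024 << 6 */

	unsigned blocksize = 1024U << log_block_size;
	unsigned clustersize = 1024U << log_cluster_size;
	unsigned cluster_bits = log_cluster_size - log_block_size;	/* 4 */
	unsigned cluster_ratio = clustersize / blocksize;		/* 16 */

	/* A 32768-cluster group therefore spans 524288 blocks. */
	printf("bits=%u ratio=%u blocks/group=%u\n",
	       cluster_bits, cluster_ratio, 32768U * cluster_ratio);
	return 0;
}
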
of clustersize * 8 blocks ? */ + if (sbi->s_blocks_per_group == clustersize << 3) + set_opt2(sb, STD_GROUP_SIZE); + + /* + * Test whether we have more sectors than will fit in sector_t, + * and whether the max offset is addressable by the page cache. + */ + err = generic_check_addressable(sb->s_blocksize_bits, + ext4_blocks_count(es)); + if (err) { + ext4_msg(sb, KERN_ERR, "filesystem" + " too large to mount safely on this system"); if (sizeof(sector_t) < 8) - printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not " - "enabled\n"); + ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); goto failed_mount; } if (EXT4_BLOCKS_PER_GROUP(sb) == 0) goto cantfind_ext4; - /* ensure blocks_count calculation below doesn't sign-extend */ - if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) < - le32_to_cpu(es->s_first_data_block) + 1) { - printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, " - "first data block %u, blocks per group %lu\n", - ext4_blocks_count(es), - le32_to_cpu(es->s_first_data_block), - EXT4_BLOCKS_PER_GROUP(sb)); + /* check blocks count against device size */ + blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + if (blocks_count && ext4_blocks_count(es) > blocks_count) { + ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " + "exceeds size of device (%llu blocks)", + ext4_blocks_count(es), blocks_count); + goto failed_mount; + } + + /* + * It makes no sense for the first data block to be beyond the end + * of the filesystem. + */ + if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { + ext4_msg(sb, KERN_WARNING, "bad geometry: first data " + "block %u is beyond end of filesystem (%llu)", + le32_to_cpu(es->s_first_data_block), + ext4_blocks_count(es)); goto failed_mount; } blocks_count = (ext4_blocks_count(es) - le32_to_cpu(es->s_first_data_block) + EXT4_BLOCKS_PER_GROUP(sb) - 1); do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); + if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { + ext4_msg(sb, KERN_WARNING, "groups count too large: %u " + "(block count %llu, first data block %u, " + "blocks per group %lu)", sbi->s_groups_count, + ext4_blocks_count(es), + le32_to_cpu(es->s_first_data_block), + EXT4_BLOCKS_PER_GROUP(sb)); + goto failed_mount; + } sbi->s_groups_count = blocks_count; + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); - sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), - GFP_KERNEL); + sbi->s_group_desc = ext4_kvmalloc(db_count * + sizeof(struct buffer_head *), + GFP_KERNEL); if (sbi->s_group_desc == NULL) { - printk (KERN_ERR "EXT4-fs: not enough memory\n"); + ext4_msg(sb, KERN_ERR, "not enough memory"); + ret = -ENOMEM; goto failed_mount; } - bgl_lock_init(&sbi->s_blockgroup_lock); + if (ext4_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); + + if (sbi->s_proc) + proc_create_data("options", S_IRUGO, sbi->s_proc, + &ext4_seq_options_fops, sb); + + bgl_lock_init(sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); sbi->s_group_desc[i] = sb_bread(sb, block); if (!sbi->s_group_desc[i]) { - printk (KERN_ERR "EXT4-fs: " - "can't read group descriptor %d\n", i); + ext4_msg(sb, KERN_ERR, + "can't read group descriptor %d", i); db_count = i; goto failed_mount2; } } - if (!ext4_check_descriptors (sb)) { - printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); 
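
The group count computed above is a round-up division of the data blocks by the group size, and db_count is a second round-up of groups over descriptors per block; the new check then rejects geometries whose group count would not fit in 32 bits. In plain C, for a sample geometry (helper name hypothetical):

#include <stdint.h>
#include <stdio.h>

static uint64_t div_round_up(uint64_t n, uint64_t d)
{
	return (n + d - 1) / d;
}

int main(void)
{
	uint64_t blocks_count = 26214400;	/* 100 GiB of 4 KiB blocks */
	uint32_t first_data_block = 0;		/* nonzero only for 1 KiB blocks */
	uint32_t blocks_per_group = 32768;
	uint32_t desc_per_block = 128;		/* 4096 / 32-byte descriptors */

	uint64_t groups = div_round_up(blocks_count - first_data_block,
				       blocks_per_group);	  /* 800 */
	uint64_t db_count = div_round_up(groups, desc_per_block); /* 7 */

	printf("groups=%llu, gdt blocks=%llu\n",
	       (unsigned long long)groups, (unsigned long long)db_count);
	return 0;
}
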
+ if (!ext4_check_descriptors(sb, &first_not_zeroed)) { + ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); goto failed_mount2; } + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + if (!ext4_fill_flex_info(sb)) { + ext4_msg(sb, KERN_ERR, + "unable to initialize " + "flex_bg meta info!"); + goto failed_mount2; + } + sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); - err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext4_count_free_blocks(sb)); + init_timer(&sbi->s_err_report); + sbi->s_err_report.function = print_daily_error_info; + sbi->s_err_report.data = (unsigned long) sb; + + /* Register extent status tree shrinker */ + ext4_es_register_shrinker(sbi); + + err = percpu_counter_init(&sbi->s_freeclusters_counter, + ext4_count_free_clusters(sb)); if (!err) { err = percpu_counter_init(&sbi->s_freeinodes_counter, ext4_count_free_inodes(sb)); @@ -2129,37 +3912,41 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) err = percpu_counter_init(&sbi->s_dirs_counter, ext4_count_dirs(sb)); } + if (!err) { + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); + } + if (!err) { + err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0); + } if (err) { - printk(KERN_ERR "EXT4-fs: insufficient memory\n"); + ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount3; } - /* per fileystem reservation list head & lock */ - spin_lock_init(&sbi->s_rsv_window_lock); - sbi->s_rsv_window_root = RB_ROOT; - /* Add a single, static dummy reservation to the start of the - * reservation window list --- it gives us a placeholder for - * append-at-start-of-list which makes the allocation logic - * _much_ simpler. */ - sbi->s_rsv_window_head.rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED; - sbi->s_rsv_window_head.rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED; - sbi->s_rsv_window_head.rsv_alloc_hit = 0; - sbi->s_rsv_window_head.rsv_goal_size = 0; - ext4_rsv_window_add(sb, &sbi->s_rsv_window_head); - sbi->s_stripe = ext4_get_stripe_size(sbi); + sbi->s_extent_max_zeroout_kb = 32; /* * set up enough so that it can read an inode */ - sb->s_op = &ext4_sops; + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + sb->s_op = &ext4_sops; + else + sb->s_op = &ext4_nojournal_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA - sb->s_qcop = &ext4_qctl_operations; sb->dq_op = &ext4_quota_operations; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) + sb->s_qcop = &ext4_qctl_sysfile_operations; + else + sb->s_qcop = &ext4_qctl_operations; #endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + mutex_init(&sbi->s_orphan_lock); sb->s_root = NULL; @@ -2167,6 +3954,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && + !(sb->s_flags & MS_RDONLY)) + if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) + goto failed_mount3; + /* * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! 
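
One structural change in this hunk is that the superblock operations table is chosen up front: journal-backed mounts get ext4_sops while NOLOAD or journal-less mounts get ext4_nojournal_sops, so the no-journal path never sees journalling entry points. A condensed model of that dispatch:

#include <stdio.h>
#include <stdbool.h>

struct super_ops { const char *name; };

static const struct super_ops journal_sops   = { "ext4_sops" };
static const struct super_ops nojournal_sops = { "ext4_nojournal_sops" };

/* Only mounts that will actually load a journal get the journalled
 * operations table. */
static const struct super_ops *pick_sops(bool has_journal_feature,
					 bool noload_opt)
{
	if (!noload_opt && has_journal_feature)
		return &journal_sops;
	return &nojournal_sops;
}

int main(void)
{
	printf("%s\n", pick_sops(true, false)->name);	/* ext4_sops */
	printf("%s\n", pick_sops(true, true)->name);	/* nojournal */
	return 0;
}
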
@@ -2175,37 +3967,29 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { if (ext4_load_journal(sb, es, journal_devnum)) goto failed_mount3; - } else if (journal_inum) { - if (ext4_create_journal(sb, es, journal_inum)) - goto failed_mount3; + } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && + EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { + ext4_msg(sb, KERN_ERR, "required journal recovery " + "suppressed and not mounted read-only"); + goto failed_mount_wq; } else { - if (!silent) - printk (KERN_ERR - "ext4: No journal on filesystem on %s\n", - sb->s_id); - goto failed_mount3; + clear_opt(sb, DATA_FLAGS); + sbi->s_journal = NULL; + needs_recovery = 0; + goto no_journal; } - if (ext4_blocks_count(es) > 0xffffffffULL && + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { - printk(KERN_ERR "ext4: Failed to set 64-bit journal feature\n"); - goto failed_mount4; + ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); + goto failed_mount_wq; } - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { - jbd2_journal_set_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); - } else if (test_opt(sb, JOURNAL_CHECKSUM)) { - jbd2_journal_set_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); - jbd2_journal_clear_features(sbi->s_journal, 0, 0, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); - } else { - jbd2_journal_clear_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + if (!set_journal_csum_feature_set(sb)) { + ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " + "feature set"); + goto failed_mount_wq; } /* We have now updated the journal if required, so we can @@ -2218,30 +4002,71 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) */ if (jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) - set_opt(sbi->s_mount_opt, ORDERED_DATA); + set_opt(sb, ORDERED_DATA); else - set_opt(sbi->s_mount_opt, JOURNAL_DATA); + set_opt(sb, JOURNAL_DATA); break; case EXT4_MOUNT_ORDERED_DATA: case EXT4_MOUNT_WRITEBACK_DATA: if (!jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { - printk(KERN_ERR "EXT4-fs: Journal does not support " - "requested data journaling mode\n"); - goto failed_mount4; + ext4_msg(sb, KERN_ERR, "Journal does not support " + "requested data journaling mode"); + goto failed_mount_wq; } default: break; } + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); - if (test_opt(sb, NOBH)) { - if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { - printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - " - "its supported only with writeback mode\n"); - clear_opt(sbi->s_mount_opt, NOBH); + sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; + + /* + * The journal may have updated the bg summary counts, so we + * need to update the global counters. 
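
The data-mode switch above derives the default from the journal's capabilities: without the JBD2 revoke feature only data=journal is safe, so an unspecified mode falls back to it and an explicit ordered or writeback request is refused. A compact model of that decision:

#include <stdio.h>
#include <stdbool.h>

enum data_mode { JOURNAL_DATA, ORDERED_DATA, WRITEBACK_DATA };

/* Returns the effective mode, or -1 if the request cannot be honoured;
 * requested < 0 means no explicit data= option was given. */
static int resolve_data_mode(int requested, bool journal_has_revoke)
{
	if (requested < 0)
		return journal_has_revoke ? ORDERED_DATA : JOURNAL_DATA;
	if (requested != JOURNAL_DATA && !journal_has_revoke)
		return -1;	/* "Journal does not support requested mode" */
	return requested;
}

int main(void)
{
	printf("%d\n", resolve_data_mode(-1, true));		  /* 1 */
	printf("%d\n", resolve_data_mode(WRITEBACK_DATA, false)); /* -1 */
	return 0;
}
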
+ */ + percpu_counter_set(&sbi->s_freeclusters_counter, + ext4_count_free_clusters(sb)); + percpu_counter_set(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + percpu_counter_set(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + percpu_counter_set(&sbi->s_dirtyclusters_counter, 0); + +no_journal: + if (ext4_mballoc_ready) { + sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); + if (!sbi->s_mb_cache) { + ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); + goto failed_mount_wq; } } + + /* + * Get the # of file system overhead blocks from the + * superblock if present. + */ + if (es->s_overhead_clusters) + sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); + else { + err = ext4_calculate_overhead(sb); + if (err) + goto failed_mount_wq; + } + + /* + * The maximum number of concurrent works can be high and + * concurrency isn't really necessary. Limit it to 1. + */ + EXT4_SB(sb)->rsv_conversion_wq = + alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); + if (!EXT4_SB(sb)->rsv_conversion_wq) { + printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); + ret = -ENOMEM; + goto failed_mount4; + } + /* * The jbd2_journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now. @@ -2249,24 +4074,25 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) root = ext4_iget(sb, EXT4_ROOT_INO); if (IS_ERR(root)) { - printk(KERN_ERR "EXT4-fs: get root inode failed\n"); + ext4_msg(sb, KERN_ERR, "get root inode failed"); ret = PTR_ERR(root); + root = NULL; goto failed_mount4; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); iput(root); - printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n"); goto failed_mount4; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { - printk(KERN_ERR "EXT4-fs: get root dentry failed\n"); - iput(root); + ext4_msg(sb, KERN_ERR, "get root dentry failed"); ret = -ENOMEM; goto failed_mount4; } - ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY); + if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY)) + sb->s_flags |= MS_RDONLY; /* determine the minimum size of new large inodes, if present */ if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { @@ -2289,52 +4115,144 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) sbi->s_inode_size) { sbi->s_want_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; - printk(KERN_INFO "EXT4-fs: required extra inode space not" - "available.\n"); + ext4_msg(sb, KERN_INFO, "required extra inode space not" + "available"); } - /* - * akpm: core read_super() calls in here with the superblock locked. - * That deadlocks, because orphan cleanup needs to lock the superblock - * in numerous places. Here we just pop the lock - it's relatively - * harmless, because we are now ready to accept write_super() requests, - * and aviro says that's the only reason for hanging onto the - * superblock lock. 
- */ + err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb)); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " + "reserved pool", ext4_calculate_resv_clusters(sb)); + goto failed_mount4a; + } + + err = ext4_setup_system_zone(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize system " + "zone (%d)", err); + goto failed_mount4a; + } + + ext4_ext_init(sb); + err = ext4_mb_init(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", + err); + goto failed_mount5; + } + + err = ext4_register_li_request(sb, first_not_zeroed); + if (err) + goto failed_mount6; + + sbi->s_kobj.kset = ext4_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, + "%s", sb->s_id); + if (err) + goto failed_mount7; + +#ifdef CONFIG_QUOTA + /* Enable quota usage during mount. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && + !(sb->s_flags & MS_RDONLY)) { + err = ext4_enable_quotas(sb); + if (err) + goto failed_mount8; + } +#endif /* CONFIG_QUOTA */ + EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; - if (needs_recovery) - printk (KERN_INFO "EXT4-fs: recovery complete.\n"); - ext4_mark_recovery_complete(sb, es); - printk (KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n", - test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": - test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": - "writeback"); + if (needs_recovery) { + ext4_msg(sb, KERN_INFO, "recovery complete"); + ext4_mark_recovery_complete(sb, es); + } + if (EXT4_SB(sb)->s_journal) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + descr = " journalled data mode"; + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + descr = " ordered data mode"; + else + descr = " writeback data mode"; + } else + descr = "out journal"; + + if (test_opt(sb, DISCARD)) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + if (!blk_queue_discard(q)) + ext4_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but " + "the device does not support discard"); + } - ext4_ext_init(sb); - ext4_mb_init(sb, needs_recovery); + ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " + "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, + *sbi->s_es->s_mount_opts ? "; " : "", orig_data); - lock_kernel(); + if (es->s_error_count) + mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ + + /* Enable message ratelimiting. Default is 10 messages per 5 secs. 
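
The comment above introduces the ratelimit state set up immediately below: 10 messages per 5-second window for each of the error, warning and message channels. A toy user-space model of that policy (this interface is made up; the kernel uses ratelimit_state_init() and __ratelimit()):

#include <stdio.h>
#include <time.h>

struct ratelimit { time_t start; int interval, burst, seen; };

/* Allow at most 'burst' events per 'interval' seconds. */
static int ratelimit_ok(struct ratelimit *rl, time_t now)
{
	if (now - rl->start >= rl->interval) {
		rl->start = now;
		rl->seen = 0;
	}
	return rl->seen++ < rl->burst;
}

int main(void)
{
	struct ratelimit rl = { .interval = 5, .burst = 10 };
	int allowed = 0;

	for (int i = 0; i < 100; i++)
		allowed += ratelimit_ok(&rl, time(NULL));
	printf("%d of 100 messages allowed\n", allowed);	/* 10 */
	return 0;
}
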
*/ + ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); + ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); + ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); + + kfree(orig_data); return 0; cantfind_ext4: if (!silent) - printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n", - sb->s_id); + ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto failed_mount; +#ifdef CONFIG_QUOTA +failed_mount8: + kobject_del(&sbi->s_kobj); +#endif +failed_mount7: + ext4_unregister_li_request(sb); +failed_mount6: + ext4_mb_release(sb); +failed_mount5: + ext4_ext_release(sb); + ext4_release_system_zone(sb); +failed_mount4a: + dput(sb->s_root); + sb->s_root = NULL; failed_mount4: - jbd2_journal_destroy(sbi->s_journal); + ext4_msg(sb, KERN_ERR, "mount failed"); + if (EXT4_SB(sb)->rsv_conversion_wq) + destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); +failed_mount_wq: + if (sbi->s_journal) { + jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + } failed_mount3: - percpu_counter_destroy(&sbi->s_freeblocks_counter); + ext4_es_unregister_shrinker(sbi); + del_timer_sync(&sbi->s_err_report); + if (sbi->s_flex_groups) + ext4_kvfree(sbi->s_flex_groups); + percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_counter_destroy(&sbi->s_extent_cache_cnt); + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); - kfree(sbi->s_group_desc); + ext4_kvfree(sbi->s_group_desc); failed_mount: + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); + if (sbi->s_proc) { + remove_proc_entry("options", sbi->s_proc); + remove_proc_entry(sb->s_id, ext4_proc_root); + } #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); @@ -2343,9 +4261,11 @@ failed_mount: brelse(bh); out_fail: sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); - lock_kernel(); - return ret; +out_free_orig: + kfree(orig_data); + return err ? err : ret; } /* @@ -2357,18 +4277,20 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) { struct ext4_sb_info *sbi = EXT4_SB(sb); - if (sbi->s_commit_interval) - journal->j_commit_interval = sbi->s_commit_interval; - /* We could also set up an ext4-specific default for the commit - * interval here, but for now we'll just fall back to the jbd - * default. */ + journal->j_commit_interval = sbi->s_commit_interval; + journal->j_min_batch_time = sbi->s_min_batch_time; + journal->j_max_batch_time = sbi->s_max_batch_time; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; - spin_unlock(&journal->j_state_lock); + if (test_opt(sb, DATA_ERR_ABORT)) + journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; + else + journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; + write_unlock(&journal->j_state_lock); } static journal_t *ext4_get_journal(struct super_block *sb, @@ -2377,33 +4299,35 @@ static journal_t *ext4_get_journal(struct super_block *sb, struct inode *journal_inode; journal_t *journal; + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + /* First, test for the existence of a valid inode on disk. Bad * things happen if we iget() an unused inode, as the subsequent * iput() will try to delete it. 
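
The failed_mount8 through failed_mount labels above undo the setup steps in exactly the reverse order they were performed, each label falling through into the next so a failure at any depth unwinds everything acquired before it. A skeletal illustration of the idiom:

#include <stdio.h>

/* Skeleton of the goto-ladder cleanup used by ext4_fill_super(): each
 * failure jumps to the label that releases what was already set up,
 * and the labels fall through from newest resource to oldest. */
static int fill_super_like(int fail_at)
{
	if (fail_at == 1)
		goto fail1;
	puts("step A done");
	if (fail_at == 2)
		goto fail2;
	puts("step B done");
	return 0;

fail2:
	puts("undo step A");	/* e.g. ext4_mb_release(), brelse(), ... */
fail1:
	puts("mount failed");
	return -1;
}

int main(void)
{
	return fill_super_like(2) ? 1 : 0;
}
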
*/ journal_inode = ext4_iget(sb, journal_inum); if (IS_ERR(journal_inode)) { - printk(KERN_ERR "EXT4-fs: no journal found.\n"); + ext4_msg(sb, KERN_ERR, "no journal found"); return NULL; } if (!journal_inode->i_nlink) { make_bad_inode(journal_inode); iput(journal_inode); - printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n"); + ext4_msg(sb, KERN_ERR, "journal inode is deleted"); return NULL; } - jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", + jbd_debug(2, "Journal inode found at %p: %lld bytes\n", journal_inode, journal_inode->i_size); if (!S_ISREG(journal_inode->i_mode)) { - printk(KERN_ERR "EXT4-fs: invalid journal inode.\n"); + ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); return NULL; } journal = jbd2_journal_init_inode(journal_inode); if (!journal) { - printk(KERN_ERR "EXT4-fs: Could not load journal inode\n"); + ext4_msg(sb, KERN_ERR, "Could not load journal inode"); iput(journal_inode); return NULL; } @@ -2415,32 +4339,27 @@ static journal_t *ext4_get_journal(struct super_block *sb, static journal_t *ext4_get_dev_journal(struct super_block *sb, dev_t j_dev) { - struct buffer_head * bh; + struct buffer_head *bh; journal_t *journal; ext4_fsblk_t start; ext4_fsblk_t len; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; - struct ext4_super_block * es; + struct ext4_super_block *es; struct block_device *bdev; - bdev = ext4_blkdev_get(j_dev); - if (bdev == NULL) - return NULL; + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); - if (bd_claim(bdev, sb)) { - printk(KERN_ERR - "EXT4: failed to claim external journal device.\n"); - blkdev_put(bdev); + bdev = ext4_blkdev_get(j_dev, sb); + if (bdev == NULL) return NULL; - } blocksize = sb->s_blocksize; - hblock = bdev_hardsect_size(bdev); + hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { - printk(KERN_ERR - "EXT4-fs: blocksize too small for journal device.\n"); + ext4_msg(sb, KERN_ERR, + "blocksize too small for journal device"); goto out_bdev; } @@ -2448,23 +4367,23 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, offset = EXT4_MIN_BLOCK_SIZE % blocksize; set_blocksize(bdev, blocksize); if (!(bh = __bread(bdev, sb_block, blocksize))) { - printk(KERN_ERR "EXT4-fs: couldn't read superblock of " - "external journal\n"); + ext4_msg(sb, KERN_ERR, "couldn't read superblock of " + "external journal"); goto out_bdev; } - es = (struct ext4_super_block *) (((char *)bh->b_data) + offset); + es = (struct ext4_super_block *) (bh->b_data + offset); if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { - printk(KERN_ERR "EXT4-fs: external journal has " - "bad superblock\n"); + ext4_msg(sb, KERN_ERR, "external journal has " + "bad superblock"); brelse(bh); goto out_bdev; } if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { - printk(KERN_ERR "EXT4-fs: journal UUID does not match\n"); + ext4_msg(sb, KERN_ERR, "journal UUID does not match"); brelse(bh); goto out_bdev; } @@ -2476,25 +4395,26 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, journal = jbd2_journal_init_dev(bdev, sb->s_bdev, start, len, blocksize); if (!journal) { - printk(KERN_ERR "EXT4-fs: failed to create device journal\n"); + ext4_msg(sb, KERN_ERR, "failed to create device journal"); goto out_bdev; } journal->j_private = sb; - ll_rw_block(READ, 1, &journal->j_sb_buffer); + ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer); wait_on_buffer(journal->j_sb_buffer); if 
(!buffer_uptodate(journal->j_sb_buffer)) { - printk(KERN_ERR "EXT4-fs: I/O error on journal device\n"); + ext4_msg(sb, KERN_ERR, "I/O error on journal device"); goto out_journal; } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { - printk(KERN_ERR "EXT4-fs: External journal has more than one " - "user (unsupported) - %d\n", + ext4_msg(sb, KERN_ERR, "External journal has more than one " + "user (unsupported) - %d", be32_to_cpu(journal->j_superblock->s_nr_users)); goto out_journal; } EXT4_SB(sb)->journal_bdev = bdev; ext4_init_journal_params(sb, journal); return journal; + out_journal: jbd2_journal_destroy(journal); out_bdev: @@ -2512,10 +4432,12 @@ static int ext4_load_journal(struct super_block *sb, int err = 0; int really_read_only; + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { - printk(KERN_INFO "EXT4-fs: external journal device major/minor " - "numbers have changed\n"); + ext4_msg(sb, KERN_INFO, "external journal device major/minor " + "numbers have changed"); journal_dev = new_decode_dev(journal_devnum); } else journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); @@ -2527,24 +4449,23 @@ static int ext4_load_journal(struct super_block *sb, * crash? For recovery, we need to check in advance whether we * can get read-write access to the device. */ - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { if (sb->s_flags & MS_RDONLY) { - printk(KERN_INFO "EXT4-fs: INFO: recovery " - "required on readonly filesystem.\n"); + ext4_msg(sb, KERN_INFO, "INFO: recovery " + "required on readonly filesystem"); if (really_read_only) { - printk(KERN_ERR "EXT4-fs: write access " - "unavailable, cannot proceed.\n"); + ext4_msg(sb, KERN_ERR, "write access " + "unavailable, cannot proceed"); return -EROFS; } - printk (KERN_INFO "EXT4-fs: write access will " - "be enabled during recovery.\n"); + ext4_msg(sb, KERN_INFO, "write access will " + "be enabled during recovery"); } } if (journal_inum && journal_dev) { - printk(KERN_ERR "EXT4-fs: filesystem has both journal " - "and inode journals!\n"); + ext4_msg(sb, KERN_ERR, "filesystem has both journal " + "and inode journals!"); return -EINVAL; } @@ -2556,22 +4477,25 @@ static int ext4_load_journal(struct super_block *sb, return -EINVAL; } - if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { - err = jbd2_journal_update_format(journal); - if (err) { - printk(KERN_ERR "EXT4-fs: error updating journal.\n"); - jbd2_journal_destroy(journal); - return err; - } - } + if (!(journal->j_flags & JBD2_BARRIER)) + ext4_msg(sb, KERN_INFO, "barriers disabled"); if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) err = jbd2_journal_wipe(journal, !really_read_only); - if (!err) + if (!err) { + char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); + if (save) + memcpy(save, ((char *) es) + + EXT4_S_ERR_START, EXT4_S_ERR_LEN); err = jbd2_journal_load(journal); + if (save) + memcpy(((char *) es) + EXT4_S_ERR_START, + save, EXT4_S_ERR_LEN); + kfree(save); + } if (err) { - printk(KERN_ERR "EXT4-fs: error loading journal.\n"); + ext4_msg(sb, KERN_ERR, "error loading journal"); jbd2_journal_destroy(journal); return err; } @@ -2579,98 +4503,109 @@ static int ext4_load_journal(struct super_block *sb, EXT4_SB(sb)->s_journal = journal; ext4_clear_journal_err(sb, es); - if (journal_devnum && + if (!really_read_only && journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); - 
sb->s_dirt = 1; /* Make sure we flush the recovery flag to disk. */ - ext4_commit_super(sb, es, 1); - } - - return 0; -} - -static int ext4_create_journal(struct super_block * sb, - struct ext4_super_block * es, - unsigned int journal_inum) -{ - journal_t *journal; - int err; - - if (sb->s_flags & MS_RDONLY) { - printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to " - "create journal.\n"); - return -EROFS; - } - - journal = ext4_get_journal(sb, journal_inum); - if (!journal) - return -EINVAL; - - printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n", - journal_inum); - - err = jbd2_journal_create(journal); - if (err) { - printk(KERN_ERR "EXT4-fs: error creating journal.\n"); - jbd2_journal_destroy(journal); - return -EIO; + ext4_commit_super(sb, 1); } - EXT4_SB(sb)->s_journal = journal; - - ext4_update_dynamic_rev(sb); - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL); - - es->s_journal_inum = cpu_to_le32(journal_inum); - sb->s_dirt = 1; - - /* Make sure we flush the recovery flag to disk. */ - ext4_commit_super(sb, es, 1); - return 0; } -static void ext4_commit_super (struct super_block * sb, - struct ext4_super_block * es, - int sync) +static int ext4_commit_super(struct super_block *sb, int sync) { + struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; + int error = 0; - if (!sbh) - return; - es->s_wtime = cpu_to_le32(get_seconds()); - ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb)); - es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); + if (!sbh || block_device_ejected(sb)) + return error; + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + ext4_msg(sb, KERN_ERR, "previous I/O error to " + "superblock detected"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + /* + * If the file system is mounted read-only, don't update the + * superblock write time. This avoids updating the superblock + * write time when we are mounting the root file system + * read/only but we need to replay the journal; at that point, + * for people who are east of GMT and who make their clock + * tick in localtime for Windows bug-for-bug compatibility, + * the clock is set in the future, and this will cause e2fsck + * to complain and force a full file system check. 
+ */ + if (!(sb->s_flags & MS_RDONLY)) + es->s_wtime = cpu_to_le32(get_seconds()); + if (sb->s_bdev->bd_part) + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1)); + else + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); + ext4_free_blocks_count_set(es, + EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeclusters_counter))); + es->s_free_inodes_count = + cpu_to_le32(percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeinodes_counter)); BUFFER_TRACE(sbh, "marking dirty"); + ext4_superblock_csum_set(sb); mark_buffer_dirty(sbh); - if (sync) - sync_dirty_buffer(sbh); + if (sync) { + error = sync_dirty_buffer(sbh); + if (error) + return error; + + error = buffer_write_io_error(sbh); + if (error) { + ext4_msg(sb, KERN_ERR, "I/O error while writing " + "superblock"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + } + return error; } - /* * Have we just finished recovery? If so, and if we are mounting (or * remounting) the filesystem readonly, then we will end up with a * consistent fs on disk. Record that fact. */ -static void ext4_mark_recovery_complete(struct super_block * sb, - struct ext4_super_block * es) +static void ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es) { journal_t *journal = EXT4_SB(sb)->s_journal; + if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { + BUG_ON(journal != NULL); + return; + } jbd2_journal_lock_updates(journal); - jbd2_journal_flush(journal); - lock_super(sb); + if (jbd2_journal_flush(journal) < 0) + goto out; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - sb->s_dirt = 0; - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); } - unlock_super(sb); + +out: jbd2_journal_unlock_updates(journal); } @@ -2679,13 +4614,15 @@ static void ext4_mark_recovery_complete(struct super_block * sb, * has recorded an error from a previous lifetime, move that error to the * main filesystem now. 
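
The lifetime-writes bookkeeping above takes the block device's written-sectors delta since mount, halves it (512-byte sectors to KiB), and adds it to the value the superblock carried at mount time. The arithmetic in isolation:

#include <stdint.h>
#include <stdio.h>

/* Model of the s_kbytes_written update in ext4_commit_super(). */
static uint64_t kbytes_written(uint64_t kbytes_at_mount,
			       uint64_t sectors_now,
			       uint64_t sectors_at_mount)
{
	return kbytes_at_mount + ((sectors_now - sectors_at_mount) >> 1);
}

int main(void)
{
	/* 4096 sectors (2 MiB) written since mount -> +2048 KiB. */
	printf("%llu KiB\n",
	       (unsigned long long)kbytes_written(1000, 14096, 10000));
	return 0;
}
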
*/ -static void ext4_clear_journal_err(struct super_block * sb, - struct ext4_super_block * es) +static void ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es) { journal_t *journal; int j_errno; const char *errstr; + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + journal = EXT4_SB(sb)->s_journal; /* @@ -2698,16 +4635,16 @@ static void ext4_clear_journal_err(struct super_block * sb, char nbuf[16]; errstr = ext4_decode_error(sb, j_errno, nbuf); - ext4_warning(sb, __FUNCTION__, "Filesystem error recorded " + ext4_warning(sb, "Filesystem error recorded " "from previous mount: %s", errstr); - ext4_warning(sb, __FUNCTION__, "Marking fs in need of " - "filesystem check."); + ext4_warning(sb, "Marking fs in need of filesystem check."); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super (sb, es, 1); + ext4_commit_super(sb, 1); jbd2_journal_clear_err(journal); + jbd2_journal_update_sb_errno(journal); } } @@ -2718,132 +4655,226 @@ static void ext4_clear_journal_err(struct super_block * sb, int ext4_force_commit(struct super_block *sb) { journal_t *journal; - int ret; if (sb->s_flags & MS_RDONLY) return 0; journal = EXT4_SB(sb)->s_journal; - sb->s_dirt = 0; - ret = ext4_journal_force_commit(journal); - return ret; -} - -/* - * Ext4 always journals updates to the superblock itself, so we don't - * have to propagate any other updates to the superblock on disk at this - * point. Just start an async writeback to get the buffers on their way - * to the disk. - * - * This implicitly triggers the writebehind on sync(). - */ - -static void ext4_write_super (struct super_block * sb) -{ - if (mutex_trylock(&sb->s_lock) != 0) - BUG(); - sb->s_dirt = 0; + return ext4_journal_force_commit(journal); } static int ext4_sync_fs(struct super_block *sb, int wait) { + int ret = 0; tid_t target; + bool needs_barrier = false; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + trace_ext4_sync_fs(sb, wait); + flush_workqueue(sbi->rsv_conversion_wq); + /* + * Writeback quota in non-journalled quota case - journalled quota has + * no dirty dquots + */ + dquot_writeback_dquots(sb, -1); + /* + * Data writeback is possible w/o journal transaction, so barrier must + * being sent at the end of the function. But we can skip it if + * transaction_commit will do it for us. + */ + target = jbd2_get_latest_transaction(sbi->s_journal); + if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) + needs_barrier = true; - sb->s_dirt = 0; - if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { if (wait) - jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); + ret = jbd2_log_wait_commit(sbi->s_journal, target); } - return 0; + if (needs_barrier) { + int err; + err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); + if (!ret) + ret = err; + } + + return ret; +} + +static int ext4_sync_fs_nojournal(struct super_block *sb, int wait) +{ + int ret = 0; + + trace_ext4_sync_fs(sb, wait); + flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq); + dquot_writeback_dquots(sb, -1); + if (wait && test_opt(sb, BARRIER)) + ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); + + return ret; } /* * LVM calls this function before a (read-only) snapshot is created. This * gives us a chance to flush the journal completely and mark the fs clean. 
+ * + * Note that only this function cannot bring a filesystem to be in a clean + * state independently. It relies on upper layer to stop all data & metadata + * modifications. */ -static void ext4_write_super_lockfs(struct super_block *sb) +static int ext4_freeze(struct super_block *sb) { - sb->s_dirt = 0; + int error = 0; + journal_t *journal; - if (!(sb->s_flags & MS_RDONLY)) { - journal_t *journal = EXT4_SB(sb)->s_journal; + if (sb->s_flags & MS_RDONLY) + return 0; - /* Now we set up the journal barrier. */ - jbd2_journal_lock_updates(journal); - jbd2_journal_flush(journal); + journal = EXT4_SB(sb)->s_journal; - /* Journal blocked and flushed, clear needs_recovery flag. */ - EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); - } + /* Now we set up the journal barrier. */ + jbd2_journal_lock_updates(journal); + + /* + * Don't clear the needs_recovery flag if we failed to flush + * the journal. + */ + error = jbd2_journal_flush(journal); + if (error < 0) + goto out; + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + error = ext4_commit_super(sb, 1); +out: + /* we rely on upper layer to stop further updates */ + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + return error; } /* * Called by LVM after the snapshot is done. We need to reset the RECOVER * flag here, even though the filesystem is not technically dirty yet. */ -static void ext4_unlockfs(struct super_block *sb) +static int ext4_unfreeze(struct super_block *sb) { - if (!(sb->s_flags & MS_RDONLY)) { - lock_super(sb); - /* Reser the needs_recovery flag before the fs is unlocked. */ - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); - unlock_super(sb); - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); - } + if (sb->s_flags & MS_RDONLY) + return 0; + + /* Reset the needs_recovery flag before the fs is unlocked. 
*/ + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_commit_super(sb, 1); + return 0; } -static int ext4_remount (struct super_block * sb, int * flags, char * data) +/* + * Structure to save mount options for ext4_remount's benefit + */ +struct ext4_mount_options { + unsigned long s_mount_opt; + unsigned long s_mount_opt2; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned long s_commit_interval; + u32 s_min_batch_time, s_max_batch_time; +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; +#endif +}; + +static int ext4_remount(struct super_block *sb, int *flags, char *data) { - struct ext4_super_block * es; + struct ext4_super_block *es; struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_fsblk_t n_blocks_count = 0; unsigned long old_sb_flags; struct ext4_mount_options old_opts; - int err; + int enable_quota = 0; + ext4_group_t g; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + int err = 0; #ifdef CONFIG_QUOTA - int i; + int i, j; #endif + char *orig_data = kstrdup(data, GFP_KERNEL); /* Store the original options */ old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; + old_opts.s_mount_opt2 = sbi->s_mount_opt2; old_opts.s_resuid = sbi->s_resuid; old_opts.s_resgid = sbi->s_resgid; old_opts.s_commit_interval = sbi->s_commit_interval; + old_opts.s_min_batch_time = sbi->s_min_batch_time; + old_opts.s_max_batch_time = sbi->s_max_batch_time; #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) - old_opts.s_qf_names[i] = sbi->s_qf_names[i]; + if (sbi->s_qf_names[i]) { + old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], + GFP_KERNEL); + if (!old_opts.s_qf_names[i]) { + for (j = 0; j < i; j++) + kfree(old_opts.s_qf_names[j]); + kfree(orig_data); + return -ENOMEM; + } + } else + old_opts.s_qf_names[i] = NULL; #endif + if (sbi->s_journal && sbi->s_journal->j_task->io_context) + journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; /* * Allow the "check" option to be passed as a remount option. */ - if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { + if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { err = -EINVAL; goto restore_opts; } - if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) - ext4_abort(sb, __FUNCTION__, "Abort forced by user"); + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + if (test_opt2(sb, EXPLICIT_DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and delalloc"); + err = -EINVAL; + goto restore_opts; + } + if (test_opt(sb, DIOREAD_NOLOCK)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dioread_nolock"); + err = -EINVAL; + goto restore_opts; + } + } + + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) + ext4_abort(sb, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + (test_opt(sb, POSIX_ACL) ? 
MS_POSIXACL : 0); es = sbi->s_es; - ext4_init_journal_params(sb, sbi->s_journal); + if (sbi->s_journal) { + ext4_init_journal_params(sb, sbi->s_journal); + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + } - if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || - n_blocks_count > ext4_blocks_count(es)) { - if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) { + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { err = -EROFS; goto restore_opts; } if (*flags & MS_RDONLY) { + err = sync_filesystem(sb); + if (err < 0) + goto restore_opts; + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + /* * First of all, the unconditional stuff we have to do * to disable replay of the journal when we next remount @@ -2859,24 +4890,31 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data) (sbi->s_mount_state & EXT4_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); - /* - * We have to unlock super so that we can wait for - * transactions. - */ - unlock_super(sb); - ext4_mark_recovery_complete(sb, es); - lock_super(sb); + if (sbi->s_journal) + ext4_mark_recovery_complete(sb, es); } else { - __le32 ret; - if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, - ~EXT4_FEATURE_RO_COMPAT_SUPP))) { - printk(KERN_WARNING "EXT4-fs: %s: couldn't " - "remount RDWR because of unsupported " - "optional features (%x).\n", - sb->s_id, le32_to_cpu(ret)); + /* Make sure we can mount this feature set readwrite */ + if (!ext4_feature_set_ok(sb, 0)) { err = -EROFS; goto restore_opts; } + /* + * Make sure the group descriptor checksums + * are sane. If they aren't, refuse to remount r/w. + */ + for (g = 0; g < sbi->s_groups_count; g++) { + struct ext4_group_desc *gdp = + ext4_get_group_desc(sb, g, NULL); + + if (!ext4_group_desc_csum_verify(sb, g, gdp)) { + ext4_msg(sb, KERN_ERR, + "ext4_remount: Checksum for group %u failed (%u!=%u)", + g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), + le16_to_cpu(gdp->bg_checksum)); + err = -EINVAL; + goto restore_opts; + } + } /* * If we have an unprocessed orphan list hanging @@ -2884,11 +4922,10 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data) * require a full umount/remount for now. */ if (es->s_last_orphan) { - printk(KERN_WARNING "EXT4-fs: %s: couldn't " + ext4_msg(sb, KERN_WARNING, "Couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " - "umount/remount instead.\n", - sb->s_id); + "umount/remount instead"); err = -EINVAL; goto restore_opts; } @@ -2899,112 +4936,119 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data) * been changed by e2fsck since we originally mounted * the partition.) 
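
The remount-to-read-write path above re-verifies every group descriptor checksum before allowing writes, since the descriptors may have been changed (by e2fsck, or by corruption) while the filesystem was read-only. A sketch of that gatekeeping pass, with a made-up stand-in for the real crc16/crc32c descriptor checksum:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

struct group_desc { uint16_t bg_checksum; };

/* Stand-in checksum; the real code hashes the descriptor contents
 * together with the group number and the filesystem seed. */
static uint16_t desc_csum(unsigned group, const struct group_desc *gdp)
{
	(void)gdp;
	return (uint16_t)(0xef53 ^ group);
}

/* Refuse to go read-write if any group descriptor fails its checksum. */
static bool all_groups_sane(const struct group_desc *gd, unsigned ngroups)
{
	for (unsigned g = 0; g < ngroups; g++)
		if (gd[g].bg_checksum != desc_csum(g, &gd[g]))
			return false;
	return true;
}

int main(void)
{
	struct group_desc gd[2] = { { 0xef53 ^ 0 }, { 0xef53 ^ 1 } };

	printf("%s\n", all_groups_sane(gd, 2) ? "ok" : "corrupt");
	return 0;
}
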
*/ - ext4_clear_journal_err(sb, es); + if (sbi->s_journal) + ext4_clear_journal_err(sb, es); sbi->s_mount_state = le16_to_cpu(es->s_state); - if ((err = ext4_group_extend(sb, es, n_blocks_count))) - goto restore_opts; - if (!ext4_setup_super (sb, es, 0)) + if (!ext4_setup_super(sb, es, 0)) sb->s_flags &= ~MS_RDONLY; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, + EXT4_FEATURE_INCOMPAT_MMP)) + if (ext4_multi_mount_protect(sb, + le64_to_cpu(es->s_mmp_block))) { + err = -EROFS; + goto restore_opts; + } + enable_quota = 1; } } + + /* + * Reinitialize lazy itable initialization thread based on + * current settings + */ + if ((sb->s_flags & MS_RDONLY) || !test_opt(sb, INIT_INODE_TABLE)) + ext4_unregister_li_request(sb); + else { + ext4_group_t first_not_zeroed; + first_not_zeroed = ext4_has_uninit_itable(sb); + ext4_register_li_request(sb, first_not_zeroed); + } + + ext4_setup_system_zone(sb); + if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY)) + ext4_commit_super(sb, 1); + #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) - if (old_opts.s_qf_names[i] && - old_opts.s_qf_names[i] != sbi->s_qf_names[i]) - kfree(old_opts.s_qf_names[i]); + kfree(old_opts.s_qf_names[i]); + if (enable_quota) { + if (sb_any_quota_suspended(sb)) + dquot_resume(sb, -1); + else if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_QUOTA)) { + err = ext4_enable_quotas(sb); + if (err) + goto restore_opts; + } + } #endif + + ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); + kfree(orig_data); return 0; + restore_opts: sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; + sbi->s_mount_opt2 = old_opts.s_mount_opt2; sbi->s_resuid = old_opts.s_resuid; sbi->s_resgid = old_opts.s_resgid; sbi->s_commit_interval = old_opts.s_commit_interval; + sbi->s_min_batch_time = old_opts.s_min_batch_time; + sbi->s_max_batch_time = old_opts.s_max_batch_time; #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { - if (sbi->s_qf_names[i] && - old_opts.s_qf_names[i] != sbi->s_qf_names[i]) - kfree(sbi->s_qf_names[i]); + kfree(sbi->s_qf_names[i]); sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } #endif + kfree(orig_data); return err; } -static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf) +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; + ext4_fsblk_t overhead = 0, resv_blocks; u64 fsid; + s64 bfree; + resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); - if (test_opt(sb, MINIX_DF)) { - sbi->s_overhead_last = 0; - } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { - ext4_group_t ngroups = sbi->s_groups_count, i; - ext4_fsblk_t overhead = 0; - smp_rmb(); - - /* - * Compute the overhead (FS structures). This is constant - * for a given filesystem unless the number of block groups - * changes so we cache the previous value until it does. - */ - - /* - * All of the blocks before first_data_block are - * overhead - */ - overhead = le32_to_cpu(es->s_first_data_block); - - /* - * Add the overhead attributed to the superblock and - * block group descriptors. If the sparse superblocks - * feature is turned on, then not all groups have this. - */ - for (i = 0; i < ngroups; i++) { - overhead += ext4_bg_has_super(sb, i) + - ext4_bg_num_gdb(sb, i); - cond_resched(); - } - - /* - * Every block group has an inode bitmap, a block - * bitmap, and an inode table. 
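
In the ext4_statfs() rewrite that follows, free space is derived from the cluster counters: clusters reserved by delayed allocation are subtracted from the free count, and both the administrative reserve (r_blocks) and the new reserved pool are withheld from f_bavail, with both subtractions clamped against underflow. The arithmetic in user-space form:

#include <stdint.h>
#include <stdio.h>

static uint64_t bavail(int64_t free_clusters, int64_t dirty_clusters,
		       uint64_t r_blocks, uint64_t resv_blocks,
		       unsigned cluster_bits)
{
	int64_t net = free_clusters - dirty_clusters;
	uint64_t bfree;

	if (net < 0)				/* max_t(s64, bfree, 0) */
		net = 0;
	bfree = (uint64_t)net << cluster_bits;	/* EXT4_C2B() */
	if (bfree < r_blocks + resv_blocks)
		return 0;
	return bfree - (r_blocks + resv_blocks);
}

int main(void)
{
	/* 1000 free clusters, 100 held by delalloc, ratio 1: the 50-block
	 * reserve plus the 4096-block pool swallows it all -> 0 available. */
	printf("%llu\n", (unsigned long long)bavail(1000, 100, 50, 4096, 0));
	return 0;
}
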
- */ - overhead += ngroups * (2 + sbi->s_itb_per_group); - sbi->s_overhead_last = overhead; - smp_wmb(); - sbi->s_blocks_last = ext4_blocks_count(es); - } + if (!test_opt(sb, MINIX_DF)) + overhead = sbi->s_overhead; buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; - buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter); - ext4_free_blocks_count_set(es, buf->f_bfree); - buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); - if (buf->f_bfree < ext4_r_blocks_count(es)) + buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead); + bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - + percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); + /* prevent underflow in case that few free space is available */ + buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); + buf->f_bavail = buf->f_bfree - + (ext4_r_blocks_count(es) + resv_blocks); + if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); - es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); buf->f_namelen = EXT4_NAME_LEN; fsid = le64_to_cpup((void *)es->s_uuid) ^ le64_to_cpup((void *)es->s_uuid + sizeof(u64)); buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + return 0; } -/* Helper function for writing quotas on sync - we need to start transaction before quota file - * is locked for write. Otherwise the are possible deadlocks: +/* Helper function for writing quotas on sync - we need to start transaction + * before quota file is locked for write. Otherwise the are possible deadlocks: * Process 1 Process 2 * ext4_create() quota_sync() - * jbd2_journal_start() write_dquot() - * DQUOT_INIT() down(dqio_mutex) + * jbd2_journal_start() write_dquot() + * dquot_initialize() down(dqio_mutex) * down(dqio_mutex) jbd2_journal_start() * */ @@ -3013,39 +5057,7 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf) static inline struct inode *dquot_to_inode(struct dquot *dquot) { - return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type]; -} - -static int ext4_dquot_initialize(struct inode *inode, int type) -{ - handle_t *handle; - int ret, err; - - /* We may create quota structure so we need to reserve enough blocks */ - handle = ext4_journal_start(inode, 2*EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ret = dquot_initialize(inode, type); - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - return ret; -} - -static int ext4_dquot_drop(struct inode *inode) -{ - handle_t *handle; - int ret, err; - - /* We may delete quota structure so we need to reserve enough blocks */ - handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ret = dquot_drop(inode); - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - return ret; + return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; } static int ext4_write_dquot(struct dquot *dquot) @@ -3055,8 +5067,8 @@ static int ext4_write_dquot(struct dquot *dquot) struct inode *inode; inode = dquot_to_inode(dquot); - handle = ext4_journal_start(inode, - EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, + EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit(dquot); @@ -3071,8 +5083,8 @@ static 
int ext4_acquire_dquot(struct dquot *dquot) int ret, err; handle_t *handle; - handle = ext4_journal_start(dquot_to_inode(dquot), - EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, + EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_acquire(dquot); @@ -3087,8 +5099,8 @@ static int ext4_release_dquot(struct dquot *dquot) int ret, err; handle_t *handle; - handle = ext4_journal_start(dquot_to_inode(dquot), - EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, + EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) { /* Release dquot anyway to avoid endless cycle in dqput() */ dquot_release(dquot); @@ -3103,9 +5115,12 @@ static int ext4_release_dquot(struct dquot *dquot) static int ext4_mark_dquot_dirty(struct dquot *dquot) { - /* Are we journalling quotas? */ - if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || - EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { + struct super_block *sb = dquot->dq_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* Are we journaling quotas? */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) || + sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { dquot_mark_dquot_dirty(dquot); return ext4_write_dquot(dquot); } else { @@ -3119,7 +5134,7 @@ static int ext4_write_info(struct super_block *sb, int type) handle_t *handle; /* Data block + inode block */ - handle = ext4_journal_start(sb->s_root->d_inode, 2); + handle = ext4_journal_start(sb->s_root->d_inode, EXT4_HT_QUOTA, 2); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit_info(sb, type); @@ -3135,45 +5150,164 @@ static int ext4_write_info(struct super_block *sb, int type) */ static int ext4_quota_on_mount(struct super_block *sb, int type) { - return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], - EXT4_SB(sb)->s_jquota_fmt, type); + return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], + EXT4_SB(sb)->s_jquota_fmt, type); } /* * Standard function to be called on quota_on */ static int ext4_quota_on(struct super_block *sb, int type, int format_id, - char *path) + struct path *path) { int err; - struct nameidata nd; if (!test_opt(sb, QUOTA)) return -EINVAL; - /* Not journalling quota? */ - if (!EXT4_SB(sb)->s_qf_names[USRQUOTA] && - !EXT4_SB(sb)->s_qf_names[GRPQUOTA]) - return vfs_quota_on(sb, type, format_id, path); - err = path_lookup(path, LOOKUP_FOLLOW, &nd); - if (err) - return err; + /* Quotafile not on the same filesystem? */ - if (nd.path.mnt->mnt_sb != sb) { - path_put(&nd.path); + if (path->dentry->d_sb != sb) return -EXDEV; + /* Journaling quota? */ + if (EXT4_SB(sb)->s_qf_names[type]) { + /* Quotafile not in fs root? */ + if (path->dentry->d_parent != sb->s_root) + ext4_msg(sb, KERN_WARNING, + "Quota file not on filesystem root. " + "Journaled quota will not work"); } - /* Quotafile not of fs root? */ - if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) - printk(KERN_WARNING - "EXT4-fs: Quota file not on filesystem root. " - "Journalled quota will not work.\n"); - path_put(&nd.path); - return vfs_quota_on(sb, type, format_id, path); + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (EXT4_SB(sb)->s_journal && + ext4_should_journal_data(path->dentry->d_inode)) { + /* + * We don't need to lock updates but journal_flush() could + * otherwise be livelocked... 
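The ext4_quota_on() path being reworked here is what ultimately services a userspace quotactl(2) request. As an illustration only (the device and quota-file paths are placeholders, and QFMT_VFS_V0 is the format id behind the jqfmt=vfsv0 mount option), turning on journaled user quota accounting looks roughly like this:

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/quota.h>

    #ifndef QFMT_VFS_V0
    #define QFMT_VFS_V0 2           /* "vfsv0" quota format id */
    #endif

    int main(void)
    {
        const char *dev = "/dev/sda1";          /* placeholder block device */
        const char *qf  = "/mnt/aquota.user";   /* placeholder quota file */

        /* This syscall is what lands in ext4_quota_on() above. */
        if (quotactl(QCMD(Q_QUOTAON, USRQUOTA), dev, QFMT_VFS_V0,
                     (caddr_t)qf) != 0) {
            perror("quotactl(Q_QUOTAON)");
            return 1;
        }
        puts("user quota enabled");
        return 0;
    }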
+ */ + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (err) + return err; + } + + return dquot_quota_on(sb, type, format_id, path); +} + +static int ext4_quota_enable(struct super_block *sb, int type, int format_id, + unsigned int flags) +{ + int err; + struct inode *qf_inode; + unsigned long qf_inums[MAXQUOTAS] = { + le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) + }; + + BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)); + + if (!qf_inums[type]) + return -EPERM; + + qf_inode = ext4_iget(sb, qf_inums[type]); + if (IS_ERR(qf_inode)) { + ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]); + return PTR_ERR(qf_inode); + } + + /* Don't account quota for quota files to avoid recursion */ + qf_inode->i_flags |= S_NOQUOTA; + err = dquot_enable(qf_inode, type, format_id, flags); + iput(qf_inode); + + return err; +} + +/* Enable usage tracking for all quota types. */ +static int ext4_enable_quotas(struct super_block *sb) +{ + int type, err = 0; + unsigned long qf_inums[MAXQUOTAS] = { + le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) + }; + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; + for (type = 0; type < MAXQUOTAS; type++) { + if (qf_inums[type]) { + err = ext4_quota_enable(sb, type, QFMT_VFS_V1, + DQUOT_USAGE_ENABLED); + if (err) { + ext4_warning(sb, + "Failed to enable quota tracking " + "(type=%d, err=%d). Please run " + "e2fsck to fix.", type, err); + return err; + } + } + } + return 0; +} + +/* + * quota_on function that is used when QUOTA feature is set. + */ +static int ext4_quota_on_sysfile(struct super_block *sb, int type, + int format_id) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) + return -EINVAL; + + /* + * USAGE was enabled at mount time. Only need to enable LIMITS now. + */ + return ext4_quota_enable(sb, type, format_id, DQUOT_LIMITS_ENABLED); +} + +static int ext4_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + handle_t *handle; + + /* Force all delayed allocation blocks to be allocated. + * Caller already holds s_umount sem */ + if (test_opt(sb, DELALLOC)) + sync_filesystem(sb); + + if (!inode) + goto out; + + /* Update modification times of quota files when userspace can + * start looking at them */ + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); + if (IS_ERR(handle)) + goto out; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + +out: + return dquot_quota_off(sb, type); +} + +/* + * quota_off function that is used when QUOTA feature is set. + */ +static int ext4_quota_off_sysfile(struct super_block *sb, int type) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) + return -EINVAL; + + /* Disable only the limits. */ + return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); } /* Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... 
As quota files are never truncated and quota code - * itself serializes the operations (and noone else should touch the files) + * itself serializes the operations (and no one else should touch the files) * we don't have to be afraid of races */ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off) @@ -3220,117 +5354,259 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int err = 0; int offset = off & (sb->s_blocksize - 1); - int tocopy; - int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL; - size_t towrite = len; struct buffer_head *bh; handle_t *handle = journal_current_handle(); - if (!handle) { - printk(KERN_WARNING "EXT4-fs: Quota write (off=%Lu, len=%Lu)" - " cancelled because transaction is not started.\n", + if (EXT4_SB(sb)->s_journal && !handle) { + ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because transaction is not started", (unsigned long long)off, (unsigned long long)len); return -EIO; } - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); - while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? - sb->s_blocksize - offset : towrite; - bh = ext4_bread(handle, inode, blk, 1, &err); - if (!bh) - goto out; - if (journal_quota) { - err = ext4_journal_get_write_access(handle, bh); - if (err) { - brelse(bh); - goto out; - } - } - lock_buffer(bh); - memcpy(bh->b_data+offset, data, tocopy); - flush_dcache_page(bh->b_page); - unlock_buffer(bh); - if (journal_quota) - err = ext4_journal_dirty_metadata(handle, bh); - else { - /* Always do at least ordered writes for quotas */ - err = ext4_journal_dirty_data(handle, bh); - mark_buffer_dirty(bh); - } + /* + * Since we account only one data block in transaction credits, + * then it is impossible to cross a block boundary. 
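That "cannot cross a block boundary" remark is simple arithmetic: with credit for only one data block in the transaction, a write must fit between its starting offset and the end of that block. A standalone sketch of the same check that ext4_quota_write() applies below (4096-byte blocks assumed for the example values):

    #include <stdio.h>

    /* A write of len bytes at offset off fits in one block iff the room
     * left in that block, blocksize - (off % blocksize), is at least len. */
    static int crosses_block(unsigned long long off, unsigned long long len,
                             unsigned int blocksize)
    {
        unsigned int offset = off & (blocksize - 1);    /* power-of-two size */

        return blocksize - offset < len;
    }

    int main(void)
    {
        printf("%d\n", crosses_block(4090, 12, 4096));  /* 1: only 6 bytes left */
        printf("%d\n", crosses_block(4090,  6, 4096));  /* 0: fits exactly */
        return 0;
    }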
+ */ + if (sb->s_blocksize - offset < len) { + ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because not block aligned", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + + bh = ext4_bread(handle, inode, blk, 1, &err); + if (!bh) + goto out; + BUFFER_TRACE(bh, "get write access"); + err = ext4_journal_get_write_access(handle, bh); + if (err) { brelse(bh); - if (err) - goto out; - offset = 0; - towrite -= tocopy; - data += tocopy; - blk++; + goto out; } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, len); + flush_dcache_page(bh->b_page); + unlock_buffer(bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); + brelse(bh); out: - if (len == towrite) + if (err) return err; - if (inode->i_size < off+len-towrite) { - i_size_write(inode, off+len-towrite); + if (inode->i_size < off + len) { + i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; + ext4_mark_inode_dirty(handle, inode); } - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - ext4_mark_inode_dirty(handle, inode); - mutex_unlock(&inode->i_mutex); - return len - towrite; + return len; } #endif -static int ext4_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super); +} + +#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static inline void register_as_ext2(void) +{ + int err = register_filesystem(&ext2_fs_type); + if (err) + printk(KERN_WARNING + "EXT4-fs: Unable to register as ext2 (%d)\n", err); +} + +static inline void unregister_as_ext2(void) { - return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); + unregister_filesystem(&ext2_fs_type); } -static struct file_system_type ext4dev_fs_type = { +static inline int ext2_feature_set_ok(struct super_block *sb) +{ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP)) + return 0; + if (sb->s_flags & MS_RDONLY) + return 1; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP)) + return 0; + return 1; +} +#else +static inline void register_as_ext2(void) { } +static inline void unregister_as_ext2(void) { } +static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } +#endif + +#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) +static inline void register_as_ext3(void) +{ + int err = register_filesystem(&ext3_fs_type); + if (err) + printk(KERN_WARNING + "EXT4-fs: Unable to register as ext3 (%d)\n", err); +} + +static inline void unregister_as_ext3(void) +{ + unregister_filesystem(&ext3_fs_type); +} + +static inline int ext3_feature_set_ok(struct super_block *sb) +{ + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP)) + return 0; + if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + return 0; + if (sb->s_flags & MS_RDONLY) + return 1; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) + return 0; + return 1; +} +#else +static inline void register_as_ext3(void) { } +static inline void unregister_as_ext3(void) { } +static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; } +#endif + +static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, - .name = "ext4dev", - .get_sb = ext4_get_sb, + .name = "ext4", + .mount = ext4_mount, .kill_sb = 
kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS_FS("ext4"); -static int __init init_ext4_fs(void) +static int __init ext4_init_feat_adverts(void) { - int err; + struct ext4_features *ef; + int ret = -ENOMEM; + + ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL); + if (!ef) + goto out; - err = init_ext4_mballoc(); + ef->f_kobj.kset = ext4_kset; + init_completion(&ef->f_kobj_unregister); + ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL, + "features"); + if (ret) { + kfree(ef); + goto out; + } + + ext4_feat = ef; + ret = 0; +out: + return ret; +} + +static void ext4_exit_feat_adverts(void) +{ + kobject_put(&ext4_feat->f_kobj); + wait_for_completion(&ext4_feat->f_kobj_unregister); + kfree(ext4_feat); +} + +/* Shared across all ext4 file systems */ +wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; +struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; + +static int __init ext4_init_fs(void) +{ + int i, err; + + ext4_li_info = NULL; + mutex_init(&ext4_li_mtx); + + /* Build-time check for flags consistency */ + ext4_check_flag_values(); + + for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { + mutex_init(&ext4__aio_mutex[i]); + init_waitqueue_head(&ext4__ioend_wq[i]); + } + + err = ext4_init_es(); if (err) return err; - err = init_ext4_xattr(); + err = ext4_init_pageio(); + if (err) + goto out7; + + err = ext4_init_system_zone(); + if (err) + goto out6; + ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); + if (!ext4_kset) { + err = -ENOMEM; + goto out5; + } + ext4_proc_root = proc_mkdir("fs/ext4", NULL); + + err = ext4_init_feat_adverts(); + if (err) + goto out4; + + err = ext4_init_mballoc(); if (err) goto out2; + else + ext4_mballoc_ready = 1; err = init_inodecache(); if (err) goto out1; - err = register_filesystem(&ext4dev_fs_type); + register_as_ext3(); + register_as_ext2(); + err = register_filesystem(&ext4_fs_type); if (err) goto out; + return 0; out: + unregister_as_ext2(); + unregister_as_ext3(); destroy_inodecache(); out1: - exit_ext4_xattr(); + ext4_mballoc_ready = 0; + ext4_exit_mballoc(); out2: - exit_ext4_mballoc(); + ext4_exit_feat_adverts(); +out4: + if (ext4_proc_root) + remove_proc_entry("fs/ext4", NULL); + kset_unregister(ext4_kset); +out5: + ext4_exit_system_zone(); +out6: + ext4_exit_pageio(); +out7: + ext4_exit_es(); + return err; } -static void __exit exit_ext4_fs(void) +static void __exit ext4_exit_fs(void) { - unregister_filesystem(&ext4dev_fs_type); + ext4_destroy_lazyinit_thread(); + unregister_as_ext2(); + unregister_as_ext3(); + unregister_filesystem(&ext4_fs_type); destroy_inodecache(); - exit_ext4_xattr(); - exit_ext4_mballoc(); + ext4_exit_mballoc(); + ext4_exit_feat_adverts(); + remove_proc_entry("fs/ext4", NULL); + kset_unregister(ext4_kset); + ext4_exit_system_zone(); + ext4_exit_pageio(); + ext4_exit_es(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); -MODULE_DESCRIPTION("Fourth Extended Filesystem with extents"); +MODULE_DESCRIPTION("Fourth Extended Filesystem"); MODULE_LICENSE("GPL"); -module_init(init_ext4_fs) -module_exit(exit_ext4_fs) +module_init(ext4_init_fs) +module_exit(ext4_exit_fs) diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index e6f9da4287c..ff371193201 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -19,14 +19,14 @@ #include <linux/fs.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> #include <linux/namei.h> +#include "ext4.h" #include "xattr.h" -static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd) +static void 
*ext4_follow_link(struct dentry *dentry, struct nameidata *nd) { struct ext4_inode_info *ei = EXT4_I(dentry->d_inode); - nd_set_link(nd, (char*)ei->i_data); + nd_set_link(nd, (char *) ei->i_data); return NULL; } @@ -34,21 +34,19 @@ const struct inode_operations ext4_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, -#ifdef CONFIG_EXT4DEV_FS_XATTR + .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif }; const struct inode_operations ext4_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ext4_follow_link, -#ifdef CONFIG_EXT4DEV_FS_XATTR + .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif }; diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h new file mode 100644 index 00000000000..011ba6670d9 --- /dev/null +++ b/fs/ext4/truncate.h @@ -0,0 +1,43 @@ +/* + * linux/fs/ext4/truncate.h + * + * Common inline functions needed for truncate support + */ + +/* + * Truncate blocks that were not used by write. We have to truncate the + * pagecache as well so that corresponding buffers get properly unmapped. + */ +static inline void ext4_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + ext4_truncate(inode); +} + +/* + * Work out how many blocks we need to proceed with the next chunk of a + * truncate transaction. + */ +static inline unsigned long ext4_blocks_for_truncate(struct inode *inode) +{ + ext4_lblk_t needed; + + needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); + + /* Give ourselves just enough room to cope with inodes in which + * i_blocks is corrupt: we've seen disk corruptions in the past + * which resulted in random data in an inode which looked enough + * like a regular file for ext4 to try to delete it. Things + * will go a bit crazy if that happens, but at least we should + * try not to panic the whole kernel. */ + if (needed < 2) + needed = 2; + + /* But we need to bound the transaction so we don't overflow the + * journal. */ + if (needed > EXT4_MAX_TRANS_DATA) + needed = EXT4_MAX_TRANS_DATA; + + return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; +} + diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index d7962139c01..e7387337060 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -53,19 +53,14 @@ #include <linux/init.h> #include <linux/fs.h> #include <linux/slab.h> -#include <linux/ext4_jbd2.h> -#include <linux/ext4_fs.h> #include <linux/mbcache.h> #include <linux/quotaops.h> #include <linux/rwsem.h> +#include "ext4_jbd2.h" +#include "ext4.h" #include "xattr.h" #include "acl.h" -#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) -#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) -#define BFIRST(bh) ENTRY(BHDR(bh)+1) -#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) - #ifdef EXT4_XATTR_DEBUG # define ea_idebug(inode, f...) do { \ printk(KERN_DEBUG "inode %s:%lu: ", \ @@ -82,48 +77,101 @@ printk("\n"); \ } while (0) #else -# define ea_idebug(f...) -# define ea_bdebug(f...) +# define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +# define ea_bdebug(bh, fmt, ...) 
no_printk(fmt, ##__VA_ARGS__) #endif -static void ext4_xattr_cache_insert(struct buffer_head *); +static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *); static struct buffer_head *ext4_xattr_cache_find(struct inode *, struct ext4_xattr_header *, struct mb_cache_entry **); static void ext4_xattr_rehash(struct ext4_xattr_header *, struct ext4_xattr_entry *); +static int ext4_xattr_list(struct dentry *dentry, char *buffer, + size_t buffer_size); -static struct mb_cache *ext4_xattr_cache; - -static struct xattr_handler *ext4_xattr_handler_map[] = { +static const struct xattr_handler *ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL - [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, - [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler, +#ifdef CONFIG_EXT4_FS_POSIX_ACL + [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, + [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, -#ifdef CONFIG_EXT4DEV_FS_SECURITY +#ifdef CONFIG_EXT4_FS_SECURITY [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, #endif }; -struct xattr_handler *ext4_xattr_handlers[] = { +const struct xattr_handler *ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL - &ext4_xattr_acl_access_handler, - &ext4_xattr_acl_default_handler, +#ifdef CONFIG_EXT4_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, #endif -#ifdef CONFIG_EXT4DEV_FS_SECURITY +#ifdef CONFIG_EXT4_FS_SECURITY &ext4_xattr_security_handler, #endif NULL }; -static inline struct xattr_handler * +#define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \ + inode->i_sb->s_fs_info)->s_mb_cache) + +static __le32 ext4_xattr_block_csum(struct inode *inode, + sector_t block_nr, + struct ext4_xattr_header *hdr) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 csum; + __le32 save_csum; + __le64 dsk_block_nr = cpu_to_le64(block_nr); + + save_csum = hdr->h_checksum; + hdr->h_checksum = 0; + csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); + csum = ext4_chksum(sbi, csum, (__u8 *)hdr, + EXT4_BLOCK_SIZE(inode->i_sb)); + + hdr->h_checksum = save_csum; + return cpu_to_le32(csum); +} + +static int ext4_xattr_block_csum_verify(struct inode *inode, + sector_t block_nr, + struct ext4_xattr_header *hdr) +{ + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr))) + return 0; + return 1; +} + +static void ext4_xattr_block_csum_set(struct inode *inode, + sector_t block_nr, + struct ext4_xattr_header *hdr) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + return; + + hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr); +} + +static inline int ext4_handle_dirty_xattr_block(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) +{ + ext4_xattr_block_csum_set(inode, bh->b_blocknr, BHDR(bh)); + return ext4_handle_dirty_metadata(handle, inode, bh); +} + +static inline const struct xattr_handler * ext4_xattr_handler(int name_index) { - struct xattr_handler *handler = NULL; + const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) handler = ext4_xattr_handler_map[name_index]; @@ -138,7 
+186,7 @@ ext4_xattr_handler(int name_index) ssize_t ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) { - return ext4_xattr_list(dentry->d_inode, buffer, size); + return ext4_xattr_list(dentry, buffer, size); } static int @@ -154,14 +202,21 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end) } static inline int -ext4_xattr_check_block(struct buffer_head *bh) +ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) { int error; + if (buffer_verified(bh)) + return 0; + if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) return -EIO; + if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh))) + return -EIO; error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); + if (!error) + set_buffer_verified(bh); return error; } @@ -211,6 +266,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, struct ext4_xattr_entry *entry; size_t size; int error; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); @@ -218,20 +274,21 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, error = -ENODATA; if (!EXT4_I(inode)->i_file_acl) goto cleanup; - ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); + ea_idebug(inode, "reading block %llu", + (unsigned long long)EXT4_I(inode)->i_file_acl); bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); if (!bh) goto cleanup; ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); - if (ext4_xattr_check_block(bh)) { -bad_block: ext4_error(inode->i_sb, __FUNCTION__, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + if (ext4_xattr_check_block(inode, bh)) { +bad_block: + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); error = -EIO; goto cleanup; } - ext4_xattr_cache_insert(bh); + ext4_xattr_cache_insert(ext4_mb_cache, bh); entry = BFIRST(bh); error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); if (error == -EIO) @@ -253,7 +310,7 @@ cleanup: return error; } -static int +int ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { @@ -265,7 +322,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *end; int error; - if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return -ENODATA; error = ext4_get_inode_loc(inode, &iloc); if (error) @@ -312,6 +369,9 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name, { int error; + if (strlen(name) > 255) + return -ERANGE; + down_read(&EXT4_I(inode)->xattr_sem); error = ext4_xattr_ibody_get(inode, name_index, name, buffer, buffer_size); @@ -323,19 +383,20 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name, } static int -ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry, +ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, char *buffer, size_t buffer_size) { size_t rest = buffer_size; for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { - struct xattr_handler *handler = + const struct xattr_handler *handler = ext4_xattr_handler(entry->e_name_index); if (handler) { - size_t size = handler->list(inode, buffer, rest, + size_t size = handler->list(dentry, buffer, rest, entry->e_name, - 
entry->e_name_len); + entry->e_name_len, + handler->flags); if (buffer) { if (size > rest) return -ERANGE; @@ -348,10 +409,12 @@ ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry, } static int -ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) +ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) { + struct inode *inode = dentry->d_inode; struct buffer_head *bh = NULL; int error; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -359,22 +422,22 @@ ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) error = 0; if (!EXT4_I(inode)->i_file_acl) goto cleanup; - ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl); + ea_idebug(inode, "reading block %llu", + (unsigned long long)EXT4_I(inode)->i_file_acl); bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); error = -EIO; if (!bh) goto cleanup; ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); - if (ext4_xattr_check_block(bh)) { - ext4_error(inode->i_sb, __FUNCTION__, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + if (ext4_xattr_check_block(inode, bh)) { + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); error = -EIO; goto cleanup; } - ext4_xattr_cache_insert(bh); - error = ext4_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size); + ext4_xattr_cache_insert(ext4_mb_cache, bh); + error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: brelse(bh); @@ -383,15 +446,16 @@ cleanup: } static int -ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) +ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) { + struct inode *inode = dentry->d_inode; struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; void *end; int error; - if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return 0; error = ext4_get_inode_loc(inode, &iloc); if (error) @@ -402,7 +466,7 @@ ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size) error = ext4_xattr_check_names(IFIRST(header), end); if (error) goto cleanup; - error = ext4_xattr_list_entries(inode, IFIRST(header), + error = ext4_xattr_list_entries(dentry, IFIRST(header), buffer, buffer_size); cleanup: @@ -420,26 +484,26 @@ cleanup: * Returns a negative error number on failure, or the number of bytes * used / required on success. 
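The "used / required" convention in the comment below matches the listxattr(2) contract that userspace depends on: a NULL buffer asks for the size needed, a second call fills it in. A minimal sketch (the path is a placeholder; the returned buffer holds NUL-separated attribute names):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/xattr.h>

    int main(void)
    {
        const char *path = "/mnt/somefile";     /* placeholder */

        ssize_t need = listxattr(path, NULL, 0);    /* probe required size */
        if (need < 0) {
            perror("listxattr");
            return 1;
        }

        char *buf = malloc(need ? need : 1);
        if (!buf)
            return 1;

        ssize_t used = listxattr(path, buf, need);  /* fetch the names */
        for (ssize_t off = 0; off < used; off += strlen(buf + off) + 1)
            puts(buf + off);    /* e.g. "user.foo", "security.selinux" */

        free(buf);
        return 0;
    }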
*/ -int -ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) +static int +ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { - int i_error, b_error; + int ret, ret2; - down_read(&EXT4_I(inode)->xattr_sem); - i_error = ext4_xattr_ibody_list(inode, buffer, buffer_size); - if (i_error < 0) { - b_error = 0; - } else { - if (buffer) { - buffer += i_error; - buffer_size -= i_error; - } - b_error = ext4_xattr_block_list(inode, buffer, buffer_size); - if (b_error < 0) - i_error = 0; + down_read(&EXT4_I(dentry->d_inode)->xattr_sem); + ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; + if (buffer) { + buffer += ret; + buffer_size -= ret; } - up_read(&EXT4_I(inode)->xattr_sem); - return i_error + b_error; + ret = ext4_xattr_block_list(dentry, buffer, buffer_size); + if (ret < 0) + goto errout; + ret += ret2; +errout: + up_read(&EXT4_I(dentry->d_inode)->xattr_sem); + return ret; } /* @@ -452,16 +516,16 @@ static void ext4_xattr_update_super_block(handle_t *handle, if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR)) return; + BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); - sb->s_dirt = 1; - ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); + ext4_handle_dirty_super(handle, sb); } } /* - * Release the xattr block BH: If the reference count is > 1, decrement - * it; otherwise free the block. + * Release the xattr block BH: If the reference count is > 1, decrement it; + * otherwise free the block. */ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, @@ -469,8 +533,10 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, { struct mb_cache_entry *ce = NULL; int error = 0; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); - ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr); + ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr); + BUFFER_TRACE(bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bh); if (error) goto out; @@ -480,22 +546,38 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, ea_bdebug(bh, "refcount now=0; freeing"); if (ce) mb_cache_entry_free(ce); - ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1); get_bh(bh); - ext4_forget(handle, 1, inode, bh, bh->b_blocknr); + unlock_buffer(bh); + ext4_free_blocks(handle, inode, bh, 0, 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); } else { - BHDR(bh)->h_refcount = cpu_to_le32( - le32_to_cpu(BHDR(bh)->h_refcount) - 1); - error = ext4_journal_dirty_metadata(handle, bh); + le32_add_cpu(&BHDR(bh)->h_refcount, -1); + if (ce) + mb_cache_entry_release(ce); + /* + * Beware of this ugliness: Releasing of xattr block references + * from different inodes can race and so we have to protect + * from a race where someone else frees the block (and releases + * its journal_head) before we are done dirtying the buffer. In + * nojournal mode this race is harmless and we actually cannot + * call ext4_handle_dirty_xattr_block() with locked buffer as + * that function can call sync_dirty_buffer() so for that case + * we handle the dirtying after unlocking the buffer. 
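For context on the release path above: ext4 lets many inodes with identical attribute sets share one xattr block, so releasing a reference only frees the block when the last user goes away. A toy model of that rule (h_refcount here is a plain integer standing in for the on-disk little-endian field, not kernel code):

    #include <stdio.h>
    #include <stdint.h>

    struct xattr_block {
        uint32_t h_refcount;    /* stands in for BHDR(bh)->h_refcount */
    };

    /* Returns 1 when the caller was the last user and the block can be
     * freed on disk; otherwise just drops one reference. */
    static int release_block(struct xattr_block *b)
    {
        if (b->h_refcount == 1) {
            b->h_refcount = 0;
            return 1;           /* last reference: free the block */
        }
        b->h_refcount--;        /* still shared by other inodes */
        return 0;
    }

    int main(void)
    {
        struct xattr_block b = { .h_refcount = 3 };

        while (!release_block(&b))
            printf("refcount now=%u; releasing\n", b.h_refcount);
        printf("refcount now=0; freeing\n");
        return 0;
    }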
+ */ + if (ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); + unlock_buffer(bh); + if (!ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); if (IS_SYNC(inode)) - handle->h_sync = 1; - DQUOT_FREE_BLOCK(inode, 1); + ext4_handle_sync(handle); + dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); - if (ce) - mb_cache_entry_release(ce); } - unlock_buffer(bh); out: ext4_std_error(inode->i_sb, error); return; @@ -509,31 +591,17 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - *total += EXT4_XATTR_LEN(last->e_name_len); if (!last->e_value_block && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; } + if (total) + *total += EXT4_XATTR_LEN(last->e_name_len); } return (*min_offs - ((void *)last - base) - sizeof(__u32)); } -struct ext4_xattr_info { - int name_index; - const char *name; - const void *value; - size_t value_len; -}; - -struct ext4_xattr_search { - struct ext4_xattr_entry *first; - void *base; - void *end; - struct ext4_xattr_entry *here; - int not_found; -}; - static int ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) { @@ -586,9 +654,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) size. Just replace. */ s->here->e_value_size = cpu_to_le32(i->value_len); - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); /* Clear pad bytes. */ - memcpy(val, i->value, i->value_len); + if (i->value == EXT4_ZERO_XATTR_VALUE) { + memset(val, 0, size); + } else { + /* Clear pad bytes first. */ + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); + memcpy(val, i->value, i->value_len); + } return 0; } @@ -627,9 +700,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) size_t size = EXT4_XATTR_SIZE(i->value_len); void *val = s->base + min_offs - size; s->here->e_value_offs = cpu_to_le16(min_offs - size); - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); /* Clear the pad bytes. */ - memcpy(val, i->value, i->value_len); + if (i->value == EXT4_ZERO_XATTR_VALUE) { + memset(val, 0, size); + } else { + /* Clear the pad bytes first. 
*/ + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); + memcpy(val, i->value, i->value_len); + } } } return 0; @@ -659,10 +737,9 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, ea_bdebug(bs->bh, "b_count=%d, refcount=%d", atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); - if (ext4_xattr_check_block(bs->bh)) { - ext4_error(sb, __FUNCTION__, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + if (ext4_xattr_check_block(inode, bs->bh)) { + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); error = -EIO; goto cleanup; } @@ -693,14 +770,16 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct ext4_xattr_search *s = &bs->s; struct mb_cache_entry *ce = NULL; int error = 0; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); #define header(x) ((struct ext4_xattr_header *)(x)) if (i->value && i->value_len > sb->s_blocksize) return -ENOSPC; if (s->base) { - ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, + ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev, bs->bh->b_blocknr); + BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, bs->bh); if (error) goto cleanup; @@ -717,14 +796,16 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (!IS_LAST_ENTRY(s->first)) ext4_xattr_rehash(header(s->base), s->here); - ext4_xattr_cache_insert(bs->bh); + ext4_xattr_cache_insert(ext4_mb_cache, + bs->bh); } unlock_buffer(bs->bh); if (error == -EIO) goto bad_block; if (!error) - error = ext4_journal_dirty_metadata(handle, - bs->bh); + error = ext4_handle_dirty_xattr_block(handle, + inode, + bs->bh); if (error) goto cleanup; goto inserted; @@ -732,13 +813,12 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, int offset = (char *)s->here - bs->bh->b_data; unlock_buffer(bs->bh); - jbd2_journal_release_buffer(handle, bs->bh); if (ce) { mb_cache_entry_release(ce); ce = NULL; } ea_bdebug(bs->bh, "cloning"); - s->base = kmalloc(bs->bh->b_size, GFP_KERNEL); + s->base = kmalloc(bs->bh->b_size, GFP_NOFS); error = -ENOMEM; if (s->base == NULL) goto cleanup; @@ -750,7 +830,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, } } else { /* Allocate a buffer where we construct the new block. */ - s->base = kzalloc(sb->s_blocksize, GFP_KERNEL); + s->base = kzalloc(sb->s_blocksize, GFP_NOFS); /* assert(header == s->base) */ error = -ENOMEM; if (s->base == NULL) @@ -781,21 +861,23 @@ inserted: else { /* The old block is released after updating the inode. 
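The allocation branch just below either bumps h_refcount on a deduplicated block or allocates a fresh one, and it clamps the allocation goal for indirect-mapped files because their on-disk block pointers are 32-bit. A small illustration of that clamp (the goal value is made up):

    #include <stdio.h>
    #include <stdint.h>

    #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFFULL  /* indirect pointers are __le32 */

    int main(void)
    {
        /* Hypothetical goal on a filesystem larger than 2^32 blocks. */
        uint64_t goal = 0x123456789ULL;

        /* Non-extent (indirect-mapped) files must stay below 2^32 blocks,
         * like the "goal & EXT4_MAX_BLOCK_FILE_PHYS" mask below. */
        uint64_t capped = goal & EXT4_MAX_BLOCK_FILE_PHYS;

        printf("goal=0x%llx capped=0x%llx\n",
               (unsigned long long)goal, (unsigned long long)capped);
        return 0;
    }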
*/ - error = -EDQUOT; - if (DQUOT_ALLOC_BLOCK(inode, 1)) + error = dquot_alloc_block(inode, + EXT4_C2B(EXT4_SB(sb), 1)); + if (error) goto cleanup; + BUFFER_TRACE(new_bh, "get_write_access"); error = ext4_journal_get_write_access(handle, new_bh); if (error) goto cleanup_dquot; lock_buffer(new_bh); - BHDR(new_bh)->h_refcount = cpu_to_le32(1 + - le32_to_cpu(BHDR(new_bh)->h_refcount)); + le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); ea_bdebug(new_bh, "reusing; refcount now=%d", le32_to_cpu(BHDR(new_bh)->h_refcount)); unlock_buffer(new_bh); - error = ext4_journal_dirty_metadata(handle, - new_bh); + error = ext4_handle_dirty_xattr_block(handle, + inode, + new_bh); if (error) goto cleanup_dquot; } @@ -808,34 +890,53 @@ inserted: get_bh(new_bh); } else { /* We need to allocate a new block */ - ext4_fsblk_t goal = le32_to_cpu( - EXT4_SB(sb)->s_es->s_first_data_block) + - (ext4_fsblk_t)EXT4_I(inode)->i_block_group * - EXT4_BLOCKS_PER_GROUP(sb); - ext4_fsblk_t block = ext4_new_block(handle, inode, - goal, &error); + ext4_fsblk_t goal, block; + + goal = ext4_group_first_block_no(sb, + EXT4_I(inode)->i_block_group); + + /* non-extent files can't have physical blocks past 2^32 */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + + /* + * take i_data_sem because we will test + * i_delalloc_reserved_flag in ext4_mb_new_blocks + */ + down_read(&EXT4_I(inode)->i_data_sem); + block = ext4_new_meta_blocks(handle, inode, goal, 0, + NULL, &error); + up_read((&EXT4_I(inode)->i_data_sem)); if (error) goto cleanup; - ea_idebug(inode, "creating block %d", block); + + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); + + ea_idebug(inode, "creating block %llu", + (unsigned long long)block); new_bh = sb_getblk(sb, block); - if (!new_bh) { + if (unlikely(!new_bh)) { + error = -ENOMEM; getblk_failed: - ext4_free_blocks(handle, inode, block, 1, 1); - error = -EIO; + ext4_free_blocks(handle, inode, NULL, block, 1, + EXT4_FREE_BLOCKS_METADATA); goto cleanup; } lock_buffer(new_bh); error = ext4_journal_get_create_access(handle, new_bh); if (error) { unlock_buffer(new_bh); + error = -EIO; goto getblk_failed; } memcpy(new_bh->b_data, s->base, new_bh->b_size); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); - ext4_xattr_cache_insert(new_bh); - error = ext4_journal_dirty_metadata(handle, new_bh); + ext4_xattr_cache_insert(ext4_mb_cache, new_bh); + error = ext4_handle_dirty_xattr_block(handle, + inode, new_bh); if (error) goto cleanup; } @@ -859,26 +960,19 @@ cleanup: return error; cleanup_dquot: - DQUOT_FREE_BLOCK(inode, 1); + dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); goto cleanup; bad_block: - ext4_error(inode->i_sb, __FUNCTION__, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); goto cleanup; #undef header } -struct ext4_xattr_ibody_find { - struct ext4_xattr_search s; - struct ext4_iloc iloc; -}; - -static int -ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) +int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; @@ -891,7 +985,7 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, is->s.base = is->s.first = IFIRST(header); is->s.here = is->s.first; is->s.end = (void *)raw_inode + 
EXT4_SB(inode->i_sb)->s_inode_size; - if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { + if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { error = ext4_xattr_check_names(IFIRST(header), is->s.end); if (error) return error; @@ -906,10 +1000,47 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, return 0; } -static int -ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) +int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_search *s = &is->s; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return -ENOSPC; + error = ext4_xattr_set_entry(i, s); + if (error) { + if (error == -ENOSPC && + ext4_has_inline_data(inode)) { + error = ext4_try_to_evict_inline_data(handle, inode, + EXT4_XATTR_LEN(strlen(i->name) + + EXT4_XATTR_SIZE(i->value_len))); + if (error) + return error; + error = ext4_xattr_ibody_find(inode, i, is); + if (error) + return error; + error = ext4_xattr_set_entry(i, s); + } + if (error) + return error; + } + header = IHDR(inode, ext4_raw_inode(&is->iloc)); + if (!IS_LAST_ENTRY(s->first)) { + header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); + ext4_set_inode_state(inode, EXT4_STATE_XATTR); + } else { + header->h_magic = cpu_to_le32(0); + ext4_clear_inode_state(inode, EXT4_STATE_XATTR); + } + return 0; +} + +static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_search *s = &is->s; @@ -923,10 +1054,10 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, header = IHDR(inode, ext4_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); - EXT4_I(inode)->i_state |= EXT4_STATE_XATTR; + ext4_set_inode_state(inode, EXT4_STATE_XATTR); } else { header->h_magic = cpu_to_le32(0); - EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR; + ext4_clear_inode_state(inode, EXT4_STATE_XATTR); } return 0; } @@ -934,7 +1065,7 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, /* * ext4_xattr_set_handle() * - * Create, replace or remove an extended attribute for this inode. Buffer + * Create, replace or remove an extended attribute for this inode. Value * is NULL to remove an existing extended attribute, and non-NULL to * either replace an existing extended attribute, or create a new extended * attribute. 
The flags XATTR_REPLACE and XATTR_CREATE @@ -961,6 +1092,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, struct ext4_xattr_block_find bs = { .s = { .not_found = -ENODATA, }, }; + unsigned long no_expand; int error; if (!name) @@ -968,14 +1100,17 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (strlen(name) > 255) return -ERANGE; down_write(&EXT4_I(inode)->xattr_sem); - error = ext4_get_inode_loc(inode, &is.iloc); + no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); + ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); + + error = ext4_reserve_inode_write(handle, inode, &is.iloc); if (error) goto cleanup; - if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) { struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); - EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW; + ext4_clear_inode_state(inode, EXT4_STATE_NEW); } error = ext4_xattr_ibody_find(inode, &i, &is); @@ -997,9 +1132,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (flags & XATTR_CREATE) goto cleanup; } - error = ext4_journal_get_write_access(handle, is.iloc.bh); - if (error) - goto cleanup; if (!value) { if (!is.s.not_found) error = ext4_xattr_ibody_set(handle, inode, &i, &is); @@ -1011,6 +1143,11 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, i.value = NULL; error = ext4_xattr_block_set(handle, inode, &i, &bs); } else if (error == -ENOSPC) { + if (EXT4_I(inode)->i_file_acl && !bs.s.base) { + error = ext4_xattr_block_find(inode, &i, &bs); + if (error) + goto cleanup; + } error = ext4_xattr_block_set(handle, inode, &i, &bs); if (error) goto cleanup; @@ -1025,7 +1162,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, ext4_xattr_update_super_block(handle, inode->i_sb); inode->i_ctime = ext4_current_time(inode); if (!value) - EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); /* * The bh is consumed by ext4_mark_iloc_dirty, even with @@ -1033,12 +1170,14 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, */ is.iloc.bh = NULL; if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); } cleanup: brelse(is.iloc.bh); brelse(bs.bh); + if (no_expand == 0) + ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); up_write(&EXT4_I(inode)->xattr_sem); return error; } @@ -1057,9 +1196,10 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, { handle_t *handle; int error, retries = 0; + int credits = ext4_jbd2_credits_xattr(inode); retry: - handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) { error = PTR_ERR(handle); } else { @@ -1117,7 +1257,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_xattr_block_find *bs = NULL; char *buffer = NULL, *b_entry_name = NULL; size_t min_offs, free; - int total_ino, total_blk; + int total_ino; void *base, *start, *end; int extra_isize = 0, error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); @@ -1165,10 +1305,9 @@ retry: error = -EIO; if (!bh) goto cleanup; - if (ext4_xattr_check_block(bh)) { - ext4_error(inode->i_sb, __FUNCTION__, - "inode %lu: bad block %llu", 
inode->i_ino, - EXT4_I(inode)->i_file_acl); + if (ext4_xattr_check_block(inode, bh)) { + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); error = -EIO; goto cleanup; } @@ -1176,8 +1315,7 @@ retry: first = BFIRST(bh); end = bh->b_data + bh->b_size; min_offs = end - base; - free = ext4_xattr_free_space(first, &min_offs, base, - &total_blk); + free = ext4_xattr_free_space(first, &min_offs, base, NULL); if (free < new_extra_isize) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; @@ -1240,6 +1378,9 @@ retry: s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; + kfree(is); is = NULL; + kfree(bs); bs = NULL; + brelse(bh); goto retry; } error = -1; @@ -1273,6 +1414,8 @@ retry: /* Remove the chosen entry from the inode */ error = ext4_xattr_ibody_set(handle, inode, &i, is); + if (error) + goto cleanup; entry = IFIRST(header); if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) @@ -1303,6 +1446,8 @@ retry: goto cleanup; kfree(b_entry_name); kfree(buffer); + b_entry_name = NULL; + buffer = NULL; brelse(is->iloc.bh); kfree(is); kfree(bs); @@ -1341,16 +1486,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) goto cleanup; bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); if (!bh) { - ext4_error(inode->i_sb, __FUNCTION__, - "inode %lu: block %llu read error", inode->i_ino, - EXT4_I(inode)->i_file_acl); + EXT4_ERROR_INODE(inode, "block %llu read error", + EXT4_I(inode)->i_file_acl); goto cleanup; } if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) { - ext4_error(inode->i_sb, __FUNCTION__, - "inode %lu: bad block %llu", inode->i_ino, - EXT4_I(inode)->i_file_acl); + EXT4_ERROR_INODE(inode, "bad block %llu", + EXT4_I(inode)->i_file_acl); goto cleanup; } ext4_xattr_release_block(handle, inode, bh); @@ -1380,18 +1523,18 @@ ext4_xattr_put_super(struct super_block *sb) * Returns 0, or a negative error number on failure. 
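The mbcache machinery reworked in the hunks below keys shared xattr blocks by h_hash. The per-name part of that hash, from ext4_xattr_hash_entry() further down, is a rotate-and-XOR over the name bytes with NAME_HASH_SHIFT of 5; a userspace rendering of just that step (the value hash and per-block mixing are omitted, and only the suffix after the prefix index is hashed):

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    #define NAME_HASH_SHIFT 5   /* as in fs/ext4/xattr.c */

    /* Rotate the running hash left by 5 bits (the right shift wraps the
     * top bits around) and XOR in the next byte of the entry name. */
    static uint32_t xattr_name_hash(const char *name, size_t len)
    {
        uint32_t hash = 0;

        while (len--)
            hash = (hash << NAME_HASH_SHIFT) ^
                   (hash >> (8 * sizeof(hash) - NAME_HASH_SHIFT)) ^
                   (unsigned char)*name++;
        return hash;
    }

    int main(void)
    {
        /* "foo" as stored in e_name; the "user." prefix is e_name_index. */
        printf("0x%08x\n", xattr_name_hash("foo", 3));
        return 0;
    }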
*/ static void -ext4_xattr_cache_insert(struct buffer_head *bh) +ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) { __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); struct mb_cache_entry *ce; int error; - ce = mb_cache_entry_alloc(ext4_xattr_cache); + ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS); if (!ce) { ea_bdebug(bh, "out of memory"); return; } - error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash); + error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); if (error) { mb_cache_entry_free(ce); if (error == -EBUSY) { @@ -1458,13 +1601,14 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, { __u32 hash = le32_to_cpu(header->h_hash); struct mb_cache_entry *ce; + struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); again: - ce = mb_cache_entry_find_first(ext4_xattr_cache, 0, - inode->i_sb->s_bdev, hash); + ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev, + hash); while (ce) { struct buffer_head *bh; @@ -1475,9 +1619,8 @@ again: } bh = sb_bread(inode->i_sb, ce->e_block); if (!bh) { - ext4_error(inode->i_sb, __FUNCTION__, - "inode %lu: block %lu read error", - inode->i_ino, (unsigned long) ce->e_block); + EXT4_ERROR_INODE(inode, "block %lu read error", + (unsigned long) ce->e_block); } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= EXT4_XATTR_REFCOUNT_MAX) { ea_idebug(inode, "block %lu refcount %d>=%d", @@ -1489,7 +1632,7 @@ again: return bh; } brelse(bh); - ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash); + ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); } return NULL; } @@ -1509,7 +1652,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, char *name = entry->e_name; int n; - for (n=0; n < entry->e_name_len; n++) { + for (n = 0; n < entry->e_name_len; n++) { hash = (hash << NAME_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ *name++; @@ -1562,21 +1705,17 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header, #undef BLOCK_HASH_SHIFT -int __init -init_ext4_xattr(void) +#define HASH_BUCKET_BITS 10 + +struct mb_cache * +ext4_xattr_create_cache(char *name) { - ext4_xattr_cache = mb_cache_create("ext4_xattr", NULL, - sizeof(struct mb_cache_entry) + - sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6); - if (!ext4_xattr_cache) - return -ENOMEM; - return 0; + return mb_cache_create(name, HASH_BUCKET_BITS); } -void -exit_ext4_xattr(void) +void ext4_xattr_destroy_cache(struct mb_cache *cache) { - if (ext4_xattr_cache) - mb_cache_destroy(ext4_xattr_cache); - ext4_xattr_cache = NULL; + if (cache) + mb_cache_destroy(cache); } + diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index d7f5d6a1265..29bedf5589f 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -21,13 +21,17 @@ #define EXT4_XATTR_INDEX_TRUSTED 4 #define EXT4_XATTR_INDEX_LUSTRE 5 #define EXT4_XATTR_INDEX_SECURITY 6 +#define EXT4_XATTR_INDEX_SYSTEM 7 +#define EXT4_XATTR_INDEX_RICHACL 8 struct ext4_xattr_header { __le32 h_magic; /* magic number for identification */ __le32 h_refcount; /* reference count */ __le32 h_blocks; /* number of disk blocks used */ __le32 h_hash; /* hash value of all attributes */ - __u32 h_reserved[4]; /* zero right now */ + __le32 h_checksum; /* crc32c(uuid+id+xattrblock) */ + /* id = inum if refcount=1, blknum otherwise */ + __u32 h_reserved[3]; /* zero right now */ }; struct ext4_xattr_ibody_header { @@ 
-51,8 +55,8 @@ struct ext4_xattr_entry { (((name_len) + EXT4_XATTR_ROUND + \ sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND) #define EXT4_XATTR_NEXT(entry) \ - ( (struct ext4_xattr_entry *)( \ - (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)) ) + ((struct ext4_xattr_entry *)( \ + (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len))) #define EXT4_XATTR_SIZE(size) \ (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND) @@ -63,18 +67,40 @@ struct ext4_xattr_entry { EXT4_I(inode)->i_extra_isize)) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) -# ifdef CONFIG_EXT4DEV_FS_XATTR +#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) +#define BFIRST(bh) ENTRY(BHDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) -extern struct xattr_handler ext4_xattr_user_handler; -extern struct xattr_handler ext4_xattr_trusted_handler; -extern struct xattr_handler ext4_xattr_acl_access_handler; -extern struct xattr_handler ext4_xattr_acl_default_handler; -extern struct xattr_handler ext4_xattr_security_handler; +#define EXT4_ZERO_XATTR_VALUE ((void *)-1) + +struct ext4_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ext4_xattr_search { + struct ext4_xattr_entry *first; + void *base; + void *end; + struct ext4_xattr_entry *here; + int not_found; +}; + +struct ext4_xattr_ibody_find { + struct ext4_xattr_search s; + struct ext4_iloc iloc; +}; + +extern const struct xattr_handler ext4_xattr_user_handler; +extern const struct xattr_handler ext4_xattr_trusted_handler; +extern const struct xattr_handler ext4_xattr_security_handler; extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); -extern int ext4_xattr_list(struct inode *, char *, size_t); extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); @@ -84,78 +110,26 @@ extern void ext4_xattr_put_super(struct super_block *); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); -extern int init_ext4_xattr(void); -extern void exit_ext4_xattr(void); - -extern struct xattr_handler *ext4_xattr_handlers[]; - -# else /* CONFIG_EXT4DEV_FS_XATTR */ - -static inline int -ext4_xattr_get(struct inode *inode, int name_index, const char *name, - void *buffer, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline int -ext4_xattr_list(struct inode *inode, void *buffer, size_t size) -{ - return -EOPNOTSUPP; -} - -static inline int -ext4_xattr_set(struct inode *inode, int name_index, const char *name, - const void *value, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline int -ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, - const char *name, const void *value, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline void -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) -{ -} - -static inline void -ext4_xattr_put_super(struct super_block *sb) -{ -} - -static inline int -init_ext4_xattr(void) -{ - return 0; -} - -static inline void -exit_ext4_xattr(void) -{ -} - -static inline int -ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, - struct ext4_inode *raw_inode, handle_t *handle) -{ - return -EOPNOTSUPP; -} +extern const struct 
xattr_handler *ext4_xattr_handlers[]; -#define ext4_xattr_handlers NULL +extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is); +extern int ext4_xattr_ibody_get(struct inode *inode, int name_index, + const char *name, + void *buffer, size_t buffer_size); +extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is); -# endif /* CONFIG_EXT4DEV_FS_XATTR */ +extern struct mb_cache *ext4_xattr_create_cache(char *name); +extern void ext4_xattr_destroy_cache(struct mb_cache *); -#ifdef CONFIG_EXT4DEV_FS_SECURITY +#ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, - struct inode *dir); + struct inode *dir, const struct qstr *qstr); #else static inline int ext4_init_security(handle_t *handle, struct inode *inode, - struct inode *dir) + struct inode *dir, const struct qstr *qstr) { return 0; } diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index f17eaf2321b..d2a200624af 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -3,17 +3,17 @@ * Handler for storing security labels as extended attributes. */ -#include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> -#include <linux/ext4_jbd2.h> -#include <linux/ext4_fs.h> #include <linux/security.h> +#include <linux/slab.h> +#include "ext4_jbd2.h" +#include "ext4.h" #include "xattr.h" static size_t -ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) +ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size, + const char *name, size_t name_len, int type) { const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; const size_t total_len = prefix_len + name_len + 1; @@ -28,47 +28,53 @@ ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size, } static int -ext4_xattr_security_get(struct inode *inode, const char *name, - void *buffer, size_t size) +ext4_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, name, - buffer, size); + return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, + name, buffer, size); } static int -ext4_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +ext4_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int type) { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name, - value, size, flags); + return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, + name, value, size, flags); } -int -ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir) +static int +ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) { - int err; - size_t len; - void *value; - char *name; + const struct xattr *xattr; + handle_t *handle = fs_info; + int err = 0; - err = security_inode_init_security(inode, dir, &name, &value, &len); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = ext4_xattr_set_handle(handle, inode, + EXT4_XATTR_INDEX_SECURITY, + xattr->name, xattr->value, + xattr->value_len, 0); + if (err < 0) + break; } - err = ext4_xattr_set_handle(handle, 
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index f17eaf2321b..d2a200624af 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -3,17 +3,17 @@
  * Handler for storing security labels as extended attributes.
  */
 
-#include <linux/module.h>
 #include <linux/string.h>
 #include <linux/fs.h>
-#include <linux/ext4_jbd2.h>
-#include <linux/ext4_fs.h>
 #include <linux/security.h>
+#include <linux/slab.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
 #include "xattr.h"
 
 static size_t
-ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size,
-			 const char *name, size_t name_len)
+ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
+			 const char *name, size_t name_len, int type)
 {
 	const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
 	const size_t total_len = prefix_len + name_len + 1;
@@ -28,47 +28,53 @@ ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size,
 }
 
 static int
-ext4_xattr_security_get(struct inode *inode, const char *name,
-			void *buffer, size_t size)
+ext4_xattr_security_get(struct dentry *dentry, const char *name,
+			void *buffer, size_t size, int type)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, name,
-			      buffer, size);
+	return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY,
+			      name, buffer, size);
 }
 
 static int
-ext4_xattr_security_set(struct inode *inode, const char *name,
-			const void *value, size_t size, int flags)
+ext4_xattr_security_set(struct dentry *dentry, const char *name,
+			const void *value, size_t size, int flags, int type)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name,
-			      value, size, flags);
+	return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY,
+			      name, value, size, flags);
 }
 
-int
-ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
+static int
+ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+		void *fs_info)
 {
-	int err;
-	size_t len;
-	void *value;
-	char *name;
+	const struct xattr *xattr;
+	handle_t *handle = fs_info;
+	int err = 0;
 
-	err = security_inode_init_security(inode, dir, &name, &value, &len);
-	if (err) {
-		if (err == -EOPNOTSUPP)
-			return 0;
-		return err;
+	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+		err = ext4_xattr_set_handle(handle, inode,
+					    EXT4_XATTR_INDEX_SECURITY,
+					    xattr->name, xattr->value,
+					    xattr->value_len, 0);
+		if (err < 0)
+			break;
 	}
-	err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY,
-				    name, value, len, 0);
-	kfree(name);
-	kfree(value);
 	return err;
 }
 
-struct xattr_handler ext4_xattr_security_handler = {
+int
+ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
+		   const struct qstr *qstr)
+{
+	return security_inode_init_security(inode, dir, qstr,
+					    &ext4_initxattrs, handle);
+}
+
+const struct xattr_handler ext4_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.list	= ext4_xattr_security_list,
 	.get	= ext4_xattr_security_get,
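The xattr_security.c hunk above moves ext4 to the callback form of security_inode_init_security(): instead of receiving a single (name, value, len) triple and freeing it, the filesystem supplies ext4_initxattrs(), which the security layer invokes once with an array terminated by a NULL name, so every label an LSM wants to attach is written under the same journal handle. A small userspace model of that callback contract (struct xattr here is a local stand-in, and demo_set_one() plays the role of ext4_xattr_set_handle(); none of these names are the kernel API):

#include <stddef.h>
#include <stdio.h>

struct xattr {
	const char *name;
	const void *value;
	size_t value_len;
};

/* Plays the role of ext4_xattr_set_handle(handle, inode, ...). */
static int demo_set_one(void *handle, const struct xattr *xattr)
{
	(void)handle;
	printf("security.%s = %zu bytes\n", xattr->name, xattr->value_len);
	return 0;
}

/* Same shape as ext4_initxattrs(): consume a NULL-name-terminated
 * array under one opaque "handle", stopping at the first error. */
static int demo_initxattrs(const struct xattr *xattr_array, void *fs_info)
{
	const struct xattr *xattr;
	int err = 0;

	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
		err = demo_set_one(fs_info, xattr);
		if (err < 0)
			break;
	}
	return err;
}

int main(void)
{
	/* An LSM such as SELinux would supply the real label here. */
	static const char ctx[] = "system_u:object_r:etc_t:s0";
	const struct xattr labels[] = {
		{ "selinux", ctx, sizeof(ctx) },
		{ NULL, NULL, 0 },
	};
	return demo_initxattrs(labels, /* handle */ NULL);
}

The point of the array form is batching: when more than one security module wants an xattr on a freshly created inode, all of them land inside the one journal handle the caller already holds, rather than one transaction per label.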
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index e0f05acdafe..95f1f4ab59a 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -5,21 +5,18 @@
  * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
  */
 
-#include <linux/module.h>
 #include <linux/string.h>
 #include <linux/capability.h>
 #include <linux/fs.h>
-#include <linux/ext4_jbd2.h>
-#include <linux/ext4_fs.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
 #include "xattr.h"
 
-#define XATTR_TRUSTED_PREFIX "trusted."
-
 static size_t
-ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
-			const char *name, size_t name_len)
+ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
+			const char *name, size_t name_len, int type)
 {
-	const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+	const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -34,26 +31,26 @@ ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
 }
 
 static int
-ext4_xattr_trusted_get(struct inode *inode, const char *name,
-		       void *buffer, size_t size)
+ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer,
+		       size_t size, int type)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED, name,
-			      buffer, size);
+	return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED,
+			      name, buffer, size);
 }
 
 static int
-ext4_xattr_trusted_set(struct inode *inode, const char *name,
-		       const void *value, size_t size, int flags)
+ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
+		       const void *value, size_t size, int flags, int type)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name,
-			      value, size, flags);
+	return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED,
+			      name, value, size, flags);
 }
 
-struct xattr_handler ext4_xattr_trusted_handler = {
+const struct xattr_handler ext4_xattr_trusted_handler = {
 	.prefix	= XATTR_TRUSTED_PREFIX,
 	.list	= ext4_xattr_trusted_list,
 	.get	= ext4_xattr_trusted_get,
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 7ed3d8ebf09..0edb7611ffb 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -5,23 +5,20 @@
  * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
  */
 
-#include <linux/module.h>
 #include <linux/string.h>
 #include <linux/fs.h>
-#include <linux/ext4_jbd2.h>
-#include <linux/ext4_fs.h>
+#include "ext4_jbd2.h"
+#include "ext4.h"
 #include "xattr.h"
 
-#define XATTR_USER_PREFIX "user."
-
 static size_t
-ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
-		     const char *name, size_t name_len)
+ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
+		     const char *name, size_t name_len, int type)
 {
-	const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
 
-	if (!test_opt(inode->i_sb, XATTR_USER))
+	if (!test_opt(dentry->d_sb, XATTR_USER))
 		return 0;
 
 	if (list && total_len <= list_size) {
@@ -33,29 +30,30 @@ ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
 }
 
 static int
-ext4_xattr_user_get(struct inode *inode, const char *name,
-		    void *buffer, size_t size)
+ext4_xattr_user_get(struct dentry *dentry, const char *name,
+		    void *buffer, size_t size, int type)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	if (!test_opt(inode->i_sb, XATTR_USER))
+	if (!test_opt(dentry->d_sb, XATTR_USER))
 		return -EOPNOTSUPP;
-	return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, name, buffer, size);
+	return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_USER,
+			      name, buffer, size);
 }
 
 static int
-ext4_xattr_user_set(struct inode *inode, const char *name,
-		    const void *value, size_t size, int flags)
+ext4_xattr_user_set(struct dentry *dentry, const char *name,
+		    const void *value, size_t size, int flags, int type)
 {
 	if (strcmp(name, "") == 0)
 		return -EINVAL;
-	if (!test_opt(inode->i_sb, XATTR_USER))
+	if (!test_opt(dentry->d_sb, XATTR_USER))
 		return -EOPNOTSUPP;
-	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name,
-			      value, size, flags);
+	return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_USER,
+			      name, value, size, flags);
 }
 
-struct xattr_handler ext4_xattr_user_handler = {
+const struct xattr_handler ext4_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
 	.list	= ext4_xattr_user_list,
 	.get	= ext4_xattr_user_get,
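Taken together, the three handler files now implement the dentry-based xattr_handler operations (a struct dentry * plus an int type argument instead of a bare inode) and are registered through the const ext4_xattr_handlers[] table declared in xattr.h. None of this changes the system-call surface: these handlers are what ultimately service setxattr(2), getxattr(2) and listxattr(2). A self-contained sketch of exercising the user. namespace gated by the test_opt(..., XATTR_USER) checks above (the path is illustrative, the file must live on an ext4 mount, and older setups may need the user_xattr mount option):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/tmp/xattr-demo";	/* hypothetical test file */
	char buf[256];
	ssize_t n, off;
	int fd;

	fd = open(path, O_CREAT | O_RDWR, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	close(fd);

	/* Served by ext4_xattr_user_set(); EOPNOTSUPP means the mount
	 * has user xattrs disabled (the test_opt XATTR_USER check). */
	if (setxattr(path, "user.comment", "hello", 5, 0) != 0)
		perror("setxattr");

	/* Served by ext4_xattr_user_get(). */
	n = getxattr(path, "user.comment", buf, sizeof(buf));
	if (n >= 0)
		printf("user.comment = %.*s\n", (int)n, buf);

	/* ext4_listxattr() asks each registered handler's ->list callback
	 * to contribute names; the result is a NUL-separated list. */
	n = listxattr(path, buf, sizeof(buf));
	for (off = 0; off < n; off += (ssize_t)strlen(buf + off) + 1)
		printf("xattr: %s\n", buf + off);
	return 0;
}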
